<3

Date: 2024-05-11 10:51 pm (UTC)
quietmoment: A photo of a symmetrical pattern in lace trim. Photo is in the public domain (0)
From: [personal profile] quietmoment
Thank you for the script! FYI, I ran into a small error on Windows:

line 823, in get_or_create_cached_image_record
    date_or_none = date_first_seen.strftime('%s')
ValueError: Invalid format string


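I got around it by adding `import time` alongside the other imports and changing line 823 (now line 824) to `date_or_none = time.mktime(date_first_seen.timetuple())`. The `'%s'` directive is a platform-specific strftime extension that Windows does not support, hence the "Invalid format string" error. (The fix is stolen from here; I don't know whether it's a good fix, though.)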

EDIT: I also ended up making some more changes so that images hosted on Dreamwidth are downloaded too, including in their original resolution - patch file below in case it's handy.
Edit again: fixed running ljdumptohtml.py on its own, and allowed images to have attributes between `<img` and `src="`.



Patch file

--- C:/dw/ljdump-1.7.4/ChangeLog	Sat May 11 00:00:00 2024
+++ C:/dw/ljdump-1.7.4-patched/ChangeLog	Sat May 11 00:00:00 2024
--- C:/dw/ljdump-1.7.4/ljdump.config.sample	Sat May 11 00:00:00 2024
+++ C:/dw/ljdump-1.7.4-patched/ljdump.config.sample	Sat May 11 00:00:00 2024
@@ -3,4 +3,5 @@
     <server>https://livejournal.com</server>
     <username>myaccount</username>
     <password>mypassword</password>
+    <ljuniq>ljuniq cookie if you want to download Dreamwidth hosted images</ljuniq>
 </ljdump>
--- C:/dw/ljdump-1.7.4/ljdump.py	Sat May 11 00:00:00 2024
+++ C:/dw/ljdump-1.7.4-patched/ljdump.py	Sat May 11 00:00:00 2024
@@ -70,7 +70,7 @@
     return e[0].firstChild.nodeValue
 
 
-def ljdump(journal_server, username, password, journal_short_name, verbose=True, stop_at_fifty=False, make_pages=False, cache_images=False, retry_images=True):
+def ljdump(journal_server, username, password, ljuniq, journal_short_name, verbose=True, stop_at_fifty=False, make_pages=False, cache_images=False, retry_images=True):
 
     m = re.search("(.*)/interface/xmlrpc", journal_server)
     if m:
@@ -417,6 +417,7 @@
         ljdumptohtml(
             username=username,
             journal_short_name=journal_short_name,
+            ljuniq=ljuniq,
             verbose=verbose,
             cache_images=cache_images,
             retry_images=retry_images
@@ -444,6 +445,11 @@
             password = password_els[0].childNodes[0].data
         else:
             password = getpass("Password: ")
+        ljuniq_els = config.documentElement.getElementsByTagName("ljuniq")
+        if len(ljuniq_els) > 0:
+            ljuniq = ljuniq_els[0].childNodes[0].data
+        else:
+            ljuniq = getpass("ljuniq cookie (for Dreamwidth hosted image downloads, leave blank otherwise): ")
         journals = [e.childNodes[0].data for e in config.documentElement.getElementsByTagName("journal")]
         if not journals:
             journals = [username]
@@ -457,6 +463,7 @@
         print
         username = input("Username: ")
         password = getpass("Password: ")
+        ljuniq = getpass("ljuniq cookie (for Dreamwidth hosted image downloads, leave blank otherwise): ")
         print
         print("You may back up either your own journal, or a community.")
         print("If you are a community maintainer, you can back up both entries and comments.")
@@ -474,6 +481,7 @@
             journal_server=journal_server,
             username=username,
             password=password,
+            ljuniq=ljuniq,
             journal_short_name=journal,
             verbose=args.verbose,
             stop_at_fifty=args.fifty,
--- C:/dw/ljdump-1.7.4/ljdumpsqlite.py	Sat May 11 00:00:00 2024
+++ C:/dw/ljdump-1.7.4-patched/ljdumpsqlite.py	Sat May 11 00:00:00 2024
@@ -30,6 +30,8 @@
 from sqlite3 import Error
 from xml.sax import saxutils
 from builtins import str
+import time
+import re
 
 
 # Subclass of tzinfo swiped mostly from dateutil
@@ -803,6 +805,10 @@
         SELECT id, url, filename, date_first_seen, date_last_attempted, cached FROM cached_images
         WHERE url = :url""", {'url': image_url})
     row = cur.fetchone()
+    pattern = re.compile('^https://(\w+).dreamwidth.org/file/\d+x\d+/(.+)')
+    if pattern.match(image_url):
+        result = pattern.search(image_url)
+        get_or_create_cached_image_record(cur, verbose, 'https://' + result.group(1) + '.dreamwidth.org/file/' + result.group(2), date_first_seen)
     if row:
         if verbose:
             print('Found image cache record for: %s' % (image_url))
@@ -820,7 +826,7 @@
             print('Creating image cache record for: %s' % (image_url))
         date_or_none = None
         if date_first_seen:
-            date_or_none = date_first_seen.strftime('%s')
+            date_or_none = time.mktime(date_first_seen.timetuple())
         data = {
             "id": None,
             "url": image_url,
--- C:/dw/ljdump-1.7.4/ljdumptohtml.py	Sat May 11 00:00:00 2024
+++ C:/dw/ljdump-1.7.4-patched/ljdumptohtml.py	Mon May 13 23:37:54 2024
@@ -385,12 +385,18 @@
 
 def resolve_cached_image_references(content, image_urls_to_filenames):
     # Find any image URLs
-    urls_found = re.findall(r'img[^\"\'()<>]*\ssrc\s?=\s?[\'\"](https?:/+[^\s\"\'()<>]+)[\'\"]', content, flags=re.IGNORECASE)
+    urls_found = re.findall(r'<img[^<>]*\ssrc\s?=\s?[\'\"](https?:/+[^\s\"\'()<>]+)[\'\"]', content, flags=re.IGNORECASE)
     # Find the set of URLs that have been resolved to local files
     resolved_urls = []
     for image_url in urls_found:
         if image_url in image_urls_to_filenames:
             resolved_urls.append(image_url)
+        pattern = re.compile('^https://(\w+).dreamwidth.org/file/\d+x\d+/(.+)')
+        if pattern.match(image_url):
+            result = pattern.search(image_url)
+            full_url = 'https://' + result.group(1) + '.dreamwidth.org/file/' + result.group(2)
+            if full_url in image_urls_to_filenames:
+                resolved_urls.append(full_url)
     # Swap them in
     for image_url in resolved_urls:
         filename = image_urls_to_filenames[image_url]
@@ -630,10 +636,15 @@
     return html_as_string
 
 
-def download_entry_image(img_url, journal_short_name, subfolder, url_id):
+def download_entry_image(img_url, journal_short_name, subfolder, url_id, entry_url, ljuniq):
     try:
-        image_req = urllib.request.urlopen(img_url, timeout = 5)
+        pattern = re.compile('^https://(\w+).dreamwidth.org/file/')
+        headers = {}
+        if pattern.match(img_url):
+            headers = {'Referer': entry_url, 'Cookie': "ljuniq="+ljuniq}
+        image_req = urllib.request.urlopen(urllib.request.Request(img_url, headers = headers), timeout = 5)
         if image_req.headers.get_content_maintype() != 'image':
+            print('Content type was not expected, image skipped: ', img_url, image_req.headers.get_content_maintype())
             return (1, None)
         extension = MimeExtensions.get(image_req.info()["Content-Type"], "")
 
@@ -681,7 +692,7 @@
         return (1, None)
 
 
-def ljdumptohtml(username, journal_short_name, verbose=True, cache_images=True, retry_images=True):
+def ljdumptohtml(username, journal_short_name, ljuniq, verbose=True, cache_images=True, retry_images=True):
     if verbose:
         print("Starting conversion for: %s" % journal_short_name)
 
@@ -741,8 +752,17 @@
                 e_id = entry['itemid']
                 entry_date = datetime.utcfromtimestamp(entry['eventtime_unix'])
                 entry_body = entry['event']
-                urls_found = re.findall(r'img[^\"\'()<>]*\ssrc\s?=\s?[\'\"](https?:/+[^\s\"\'()<>]+)[\'\"]', entry_body, flags=re.IGNORECASE)
+                initial_urls_found = re.findall(r'<img[^<>]*\ssrc\s?=\s?[\'\"](https?:/+[^\s\"\'()<>]+)[\'\"]', entry_body, flags=re.IGNORECASE)
                 subfolder = entry_date.strftime("%Y-%m")
+                urls_found = []
+                for image_url in initial_urls_found:
+                    urls_found.append(image_url)
+                    pattern = re.compile('^https://(\w+).dreamwidth.org/file/\d+x\d+/(.+)')
+                    if pattern.match(image_url):
+                        result = pattern.search(image_url)
+                        full_url = 'https://' + result.group(1) + '.dreamwidth.org/file/' + result.group(2)
+                        urls_found.append(full_url)
+
                 for image_url in urls_found:
                     cached_image = get_or_create_cached_image_record(cur, verbose, image_url, entry_date)
                     try_cache = True
@@ -758,7 +778,7 @@
                         image_id = cached_image['id']
                         cache_result = 0
                         img_filename = None
-                        (cache_result, img_filename) = download_entry_image(image_url, journal_short_name, subfolder, image_id)
+                        (cache_result, img_filename) = download_entry_image(image_url, journal_short_name, subfolder, image_id, entry['url'], ljuniq)
                         if (cache_result == 0) and (img_filename is not None):
                             report_image_as_cached(cur, verbose, image_id, img_filename, entry_date)
                             image_resolve_max -= 1
@@ -955,6 +975,11 @@
         journals = [e.childNodes[0].data for e in config.documentElement.getElementsByTagName("journal")]
         if not journals:
             journals = [username]
+        ljuniq_els = config.documentElement.getElementsByTagName("ljuniq")
+        if len(ljuniq_els) > 0:
+            ljuniq = ljuniq_els[0].childNodes[0].data
+        else:
+            ljuniq = getpass("ljuniq cookie (for Dreamwidth hosted image downloads, leave blank otherwise): ")
     else:
         print("ljdumptohtml - livejournal (or Dreamwidth, etc) archive to html utility")
         print
@@ -976,6 +1001,7 @@
         ljdumptohtml(
             username=username,
             journal_short_name=journal,
+            ljuniq=ljuniq,
             verbose=args.verbose,
             cache_images=args.cache_images,
             retry_images=args.retry_images
--- C:/dw/ljdump-1.7.4/README.md	Sat May 11 00:00:00 2024
+++ C:/dw/ljdump-1.7.4-patched/README.md	Sat May 11 00:00:00 2024
--- C:/dw/ljdump-1.7.4/stylesheet.css	Sat May 11 00:00:00 2024
+++ C:/dw/ljdump-1.7.4-patched/stylesheet.css	Sat May 11 00:00:00 2024



(will be screened)
(will be screened if not validated)
If you don't have an account you can create one now.
HTML doesn't work in the subject.
More info about formatting

Profile

garote: (Default)
garote

May 2025

 S  M  T  W  T  F  S
             1  2  3
 4  5  6  7  8  9 10
11 12 13 14 15 16 17
18 19 20 21 22 23 24
25 26 27 28 29 30 31

Most Popular Tags

Page generated May. 29th, 2025 11:01 pm