Thank you for the script! FYI, I ran into a small error on Windows:
line 823, in get_or_create_cached_image_record
date_or_none = date_first_seen.strftime('%s')
ValueError: Invalid format string
I got around it by adding `import time` with the other imports and changing line 823 (now 824) to `date_or_none = time.mktime(date_first_seen.timetuple())` (fix borrowed from here — I'm not sure whether it's the best fix, though; note that `strftime('%s')` is a non-portable glibc extension, which is why it fails on Windows).
EDIT: I also ended up making some more changes to download images hosted on Dreamwidth, in their original resolution as well — patch file below in case it's handy. Edit again: fixed running ljdumptohtml.py on its own, and allowed images to have attributes between `<img` and `src="`.
Patch file
--- C:/dw/ljdump-1.7.4/ChangeLog Sat May 11 00:00:00 2024
+++ C:/dw/ljdump-1.7.4-patched/ChangeLog Sat May 11 00:00:00 2024
--- C:/dw/ljdump-1.7.4/ljdump.config.sample Sat May 11 00:00:00 2024
+++ C:/dw/ljdump-1.7.4-patched/ljdump.config.sample Sat May 11 00:00:00 2024
@@ -3,4 +3,5 @@
<server>https://livejournal.com</server>
<username>myaccount</username>
<password>mypassword</password>
+ <ljuniq>ljuniq cookie if you want to download Dreamwidth hosted images</ljuniq>
</ljdump>
--- C:/dw/ljdump-1.7.4/ljdump.py Sat May 11 00:00:00 2024
+++ C:/dw/ljdump-1.7.4-patched/ljdump.py Sat May 11 00:00:00 2024
@@ -70,7 +70,7 @@
return e[0].firstChild.nodeValue
-def ljdump(journal_server, username, password, journal_short_name, verbose=True, stop_at_fifty=False, make_pages=False, cache_images=False, retry_images=True):
+def ljdump(journal_server, username, password, ljuniq, journal_short_name, verbose=True, stop_at_fifty=False, make_pages=False, cache_images=False, retry_images=True):
m = re.search("(.*)/interface/xmlrpc", journal_server)
if m:
@@ -417,6 +417,7 @@
ljdumptohtml(
username=username,
journal_short_name=journal_short_name,
+ ljuniq=ljuniq,
verbose=verbose,
cache_images=cache_images,
retry_images=retry_images
@@ -444,6 +445,11 @@
password = password_els[0].childNodes[0].data
else:
password = getpass("Password: ")
+ ljuniq_els = config.documentElement.getElementsByTagName("ljuniq")
+ if len(ljuniq_els) > 0:
+ ljuniq = ljuniq_els[0].childNodes[0].data
+ else:
+ ljuniq = getpass("ljuniq cookie (for Dreamwidth hosted image downloads, leave blank otherwise): ")
journals = [e.childNodes[0].data for e in config.documentElement.getElementsByTagName("journal")]
if not journals:
journals = [username]
@@ -457,6 +463,7 @@
print
username = input("Username: ")
password = getpass("Password: ")
+ ljuniq = getpass("ljuniq cookie (for Dreamwidth hosted image downloads, leave blank otherwise): ")
print
print("You may back up either your own journal, or a community.")
print("If you are a community maintainer, you can back up both entries and comments.")
@@ -474,6 +481,7 @@
journal_server=journal_server,
username=username,
password=password,
+ ljuniq=ljuniq,
journal_short_name=journal,
verbose=args.verbose,
stop_at_fifty=args.fifty,
--- C:/dw/ljdump-1.7.4/ljdumpsqlite.py Sat May 11 00:00:00 2024
+++ C:/dw/ljdump-1.7.4-patched/ljdumpsqlite.py Sat May 11 00:00:00 2024
@@ -30,6 +30,8 @@
from sqlite3 import Error
from xml.sax import saxutils
from builtins import str
+import time
+import re
# Subclass of tzinfo swiped mostly from dateutil
@@ -803,6 +805,10 @@
SELECT id, url, filename, date_first_seen, date_last_attempted, cached FROM cached_images
WHERE url = :url""", {'url': image_url})
row = cur.fetchone()
+ pattern = re.compile('^https://(\w+).dreamwidth.org/file/\d+x\d+/(.+)')
+ if pattern.match(image_url):
+ result = pattern.search(image_url)
+ get_or_create_cached_image_record(cur, verbose, 'https://' + result.group(1) + '.dreamwidth.org/file/' + result.group(2), date_first_seen)
if row:
if verbose:
print('Found image cache record for: %s' % (image_url))
@@ -820,7 +826,7 @@
print('Creating image cache record for: %s' % (image_url))
date_or_none = None
if date_first_seen:
- date_or_none = date_first_seen.strftime('%s')
+ date_or_none = time.mktime(date_first_seen.timetuple())
data = {
"id": None,
"url": image_url,
--- C:/dw/ljdump-1.7.4/ljdumptohtml.py Sat May 11 00:00:00 2024
+++ C:/dw/ljdump-1.7.4-patched/ljdumptohtml.py Mon May 13 23:37:54 2024
@@ -385,12 +385,18 @@
def resolve_cached_image_references(content, image_urls_to_filenames):
# Find any image URLs
- urls_found = re.findall(r'img[^\"\'()<>]*\ssrc\s?=\s?[\'\"](https?:/+[^\s\"\'()<>]+)[\'\"]', content, flags=re.IGNORECASE)
+ urls_found = re.findall(r'<img[^<>]*\ssrc\s?=\s?[\'\"](https?:/+[^\s\"\'()<>]+)[\'\"]', content, flags=re.IGNORECASE)
# Find the set of URLs that have been resolved to local files
resolved_urls = []
for image_url in urls_found:
if image_url in image_urls_to_filenames:
resolved_urls.append(image_url)
+ pattern = re.compile('^https://(\w+).dreamwidth.org/file/\d+x\d+/(.+)')
+ if pattern.match(image_url):
+ result = pattern.search(image_url)
+ full_url = 'https://' + result.group(1) + '.dreamwidth.org/file/' + result.group(2)
+ if full_url in image_urls_to_filenames:
+ resolved_urls.append(full_url)
# Swap them in
for image_url in resolved_urls:
filename = image_urls_to_filenames[image_url]
@@ -630,10 +636,15 @@
return html_as_string
-def download_entry_image(img_url, journal_short_name, subfolder, url_id):
+def download_entry_image(img_url, journal_short_name, subfolder, url_id, entry_url, ljuniq):
try:
- image_req = urllib.request.urlopen(img_url, timeout = 5)
+ pattern = re.compile('^https://(\w+).dreamwidth.org/file/')
+ headers = {}
+ if pattern.match(img_url):
+ headers = {'Referer': entry_url, 'Cookie': "ljuniq="+ljuniq}
+ image_req = urllib.request.urlopen(urllib.request.Request(img_url, headers = headers), timeout = 5)
if image_req.headers.get_content_maintype() != 'image':
+ print('Content type was not expected, image skipped: ', img_url, image_req.headers.get_content_maintype())
return (1, None)
extension = MimeExtensions.get(image_req.info()["Content-Type"], "")
@@ -681,7 +692,7 @@
return (1, None)
-def ljdumptohtml(username, journal_short_name, verbose=True, cache_images=True, retry_images=True):
+def ljdumptohtml(username, journal_short_name, ljuniq, verbose=True, cache_images=True, retry_images=True):
if verbose:
print("Starting conversion for: %s" % journal_short_name)
@@ -741,8 +752,17 @@
e_id = entry['itemid']
entry_date = datetime.utcfromtimestamp(entry['eventtime_unix'])
entry_body = entry['event']
- urls_found = re.findall(r'img[^\"\'()<>]*\ssrc\s?=\s?[\'\"](https?:/+[^\s\"\'()<>]+)[\'\"]', entry_body, flags=re.IGNORECASE)
+ initial_urls_found = re.findall(r'<img[^<>]*\ssrc\s?=\s?[\'\"](https?:/+[^\s\"\'()<>]+)[\'\"]', entry_body, flags=re.IGNORECASE)
subfolder = entry_date.strftime("%Y-%m")
+ urls_found = []
+ for image_url in initial_urls_found:
+ urls_found.append(image_url)
+ pattern = re.compile('^https://(\w+).dreamwidth.org/file/\d+x\d+/(.+)')
+ if pattern.match(image_url):
+ result = pattern.search(image_url)
+ full_url = 'https://' + result.group(1) + '.dreamwidth.org/file/' + result.group(2)
+ urls_found.append(full_url)
+
for image_url in urls_found:
cached_image = get_or_create_cached_image_record(cur, verbose, image_url, entry_date)
try_cache = True
@@ -758,7 +778,7 @@
image_id = cached_image['id']
cache_result = 0
img_filename = None
- (cache_result, img_filename) = download_entry_image(image_url, journal_short_name, subfolder, image_id)
+ (cache_result, img_filename) = download_entry_image(image_url, journal_short_name, subfolder, image_id, entry['url'], ljuniq)
if (cache_result == 0) and (img_filename is not None):
report_image_as_cached(cur, verbose, image_id, img_filename, entry_date)
image_resolve_max -= 1
@@ -955,6 +975,11 @@
journals = [e.childNodes[0].data for e in config.documentElement.getElementsByTagName("journal")]
if not journals:
journals = [username]
+ ljuniq_els = config.documentElement.getElementsByTagName("ljuniq")
+ if len(ljuniq_els) > 0:
+ ljuniq = ljuniq_els[0].childNodes[0].data
+ else:
+ ljuniq = getpass("ljuniq cookie (for Dreamwidth hosted image downloads, leave blank otherwise): ")
else:
print("ljdumptohtml - livejournal (or Dreamwidth, etc) archive to html utility")
print
@@ -976,6 +1001,7 @@
ljdumptohtml(
username=username,
journal_short_name=journal,
+ ljuniq=ljuniq,
verbose=args.verbose,
cache_images=args.cache_images,
retry_images=args.retry_images
--- C:/dw/ljdump-1.7.4/README.md Sat May 11 00:00:00 2024
+++ C:/dw/ljdump-1.7.4-patched/README.md Sat May 11 00:00:00 2024
--- C:/dw/ljdump-1.7.4/stylesheet.css Sat May 11 00:00:00 2024
+++ C:/dw/ljdump-1.7.4-patched/stylesheet.css Sat May 11 00:00:00 2024
<3
Date: 2024-05-11 10:51 pm (UTC)

Got around it by adding
import time
with the other imports and changing line 823 (now 824) to date_or_none = time.mktime(date_first_seen.timetuple())
(fix stolen from here, dunno if it's a good fix tho)

EDIT: I also ended up making some more changes to download images hosted on Dreamwidth, also in their original resolution — patch file below in case it's handy.
Edit again: fix running ljdumptohtml.py alone, allow images to have attributes between <img and src="
Patch file