Tumblr support, proper xml parsing

dakedres 2023-08-05 14:38:05 -04:00
parent c0e45d3d30
commit aacf2483dd
5 changed files with 210 additions and 51 deletions

bun.lockb (binary file not shown)

config.js

@@ -104,9 +104,23 @@ const sources = {
 	]
 }
 
+const endPage = `
+<html>
+	<body>
+		<center>
+			<img src="end.jpg">
+			<h4>You have reached the end</h4>
+			<hr>
+		</center>
+	</body>
+</html>`
+
 module.exports = {
 	feeds,
 	sources,
 	pageSize,
-	courtesyWait
+	courtesyWait,
+	endPage
 }
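index.js below reads config.feeds.nitter, config.feeds.tumblr, config.tooLongAgo, and config.courtesyWait, plus the sources object whose closing brackets open this hunk, so the top of config.js presumably looks something like this sketch. Every concrete value here is a placeholder, not part of the commit:

    // Hypothetical shape of the values config.js exports, inferred from how
    // index.js uses them; all concrete values are invented.
    const feeds = {
        nitter: ['someTwitterUser'],  // handles fetched through Nitter mirrors
        tumblr: ['someTumblrBlog']    // blogs fetched from <blog>.tumblr.com/rss
    }
    const sources = {
        nitter: [
            'nitter.net'              // mirrors tried in order until one parses
        ]
    }
    const pageSize = 50               // posts per generated HTML page
    const courtesyWait = 1000         // ms between requests to the same host
    const tooLongAgo = 7              // drop posts older than this many days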

index.js

@@ -1,7 +1,7 @@
-const { fetch } = require('node-fetch')
+const fetch = require('node-fetch')
 const config = require('./config.js')
 let cache = require('./cache.json')
 const Path = require('path')
+const { JSDOM } = require('jsdom')
 
 let waitingList = new Map()
@@ -29,24 +29,18 @@ const handleNitterUser = async user => {
 	while(!data && index < sources.length) {
 		let source = sources[index]
 
-		if(waitingList.get(source)) {
-			console.log('Waiting...')
-			await sleep(config.courtesyWait)
-			waitingList.set(source, false)
-		}
-
-		let rss = await fetch('https://' + source + '/' + user + "/rss")
-			.catch(console.error)
-			.then(r => r.text() )
-		waitingList.set(source, true)
+		let rss = await fetchRss(source, user + '/rss')
 
 		try {
-			data = processNitter(user, rss)
+			data = processNitter(rss, user)
 		} catch(err) {
-			console.log(`Failed to fetch ${user} from ${source}`)
-			index++
+			if(err.constructor.name == NoMatchesError.name) {
+				console.log(`Failed to fetch ${user} from ${source}`)
+				index++
+			} else {
+				console.error(err)
+				break
+			}
 		}
 	}
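The retry loop now tells parse failures apart from everything else: a NoMatchesError (defined further down) advances to the next mirror, while any other error is logged and aborts the loop. It compares constructor names rather than using instanceof, which behaves the same as long as only one copy of the class is loaded. A minimal sketch of the same fall-through pattern in the more conventional form, with fetchOnce as a hypothetical stand-in for one fetch-and-parse attempt:

    class NoMatchesError extends Error {}

    const tryMirrors = async (mirrors, fetchOnce) => {
        for(let mirror of mirrors) {
            try {
                return await fetchOnce(mirror)
            } catch(err) {
                if(err instanceof NoMatchesError) continue // bad parse: try the next mirror
                throw err // transport or logic error: surface it instead of retrying
            }
        }
    }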
@@ -57,51 +51,138 @@ const handleNitterUser = async user => {
 const sleep = delay => new Promise(resolve => setTimeout(() => resolve(), delay) )
 
-const processNitter = (user, rss) => {
-	const descriptionMatches = getMatches(
-		new RegExp(`\
-<item>.*?\
-<dc:creator>@${user}<\/dc:creator>.*?\
-<description>(.+?)<\/description>.*?\
-<pubDate>(.+?)</pubDate>.*?\
-<link>(.*?)<\/link>`, 'sg')
-	)(rss)
-
-	if(descriptionMatches.length == 0) {
-		throw new Error('Got no matches')
-		return
-	}
-
-	const getImageMatches = getMatches(/<img src="([^]*?)"/g)
-	let posts = []
-
-	for(let [, description, date, link] of descriptionMatches) {
-		let images = []
-		for(let [, url] of getImageMatches(description) ) {
-			images.push(url)
-		}
-		if(images.length > 0) {
-			posts.push({
-				user,
-				images,
-				date: new Date(date).valueOf(),
-				link
-			})
-		}
-	}
-
-	return posts
-}
+class NoMatchesError extends Error {}
+
+const processRss = (rss, reducerCallback, cdata) => {
+	let { document } = new JSDOM(rss, {
+		contentType: 'text/xml'
+	}).window
+
+	let items = document.querySelectorAll('channel item')
+
+	if(items.length == 0) {
+		throw new NoMatchesError('Got no matches')
+	}
+
+	let posts = []
+
+	for(let item of items) {
+		let description = new JSDOM(item.querySelector('description').textContent).window.document
+		// let description = item.querySelector('description')
+		let dateString = item.querySelector('pubDate').textContent
+		let link = item.querySelector('link').textContent
+
+		let post = reducerCallback(item, description, dateString, link)
+
+		if(post) {
+			post.date = new Date(dateString).valueOf() ?? 0
+			post.link = link
+			posts.push(post)
+		}
+	}
+
+	return posts
+}
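processRss now owns the XML handling end to end: JSDOM parses the feed as text/xml, an empty channel raises NoMatchesError, and per-item policy lives in reducerCallback, which receives (item, description, dateString, link) and returns either a partial post or undefined to drop the item; processRss then stamps date and link onto whatever comes back. A hypothetical consumer, with an invented feed string and processRss assumed in scope:

    // Invented two-line feed, just enough structure for 'channel item' to match
    const rss = `<rss><channel><item>
        <description>&lt;p&gt;hello&lt;/p&gt;</description>
        <pubDate>Sat, 05 Aug 2023 18:38:05 GMT</pubDate>
        <link>https://example.com/post/1</link>
    </item></channel></rss>`

    let posts = processRss(rss, (item, description, dateString, link) => {
        // Return a partial post to keep the item, or undefined to drop it;
        // processRss fills in post.date and post.link afterwards.
        return { user: 'someUser' }
    })

One wrinkle worth noting: new Date(dateString).valueOf() ?? 0 only substitutes 0 for null or undefined, and an unparseable pubDate yields NaN, which ?? passes through; || 0 would be the fallback that actually catches it.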
+const fetchRss = async (hostname, path) => {
+	let waitFor = waitingList.get(hostname)
+	if(waitFor !== 0) {
+		await sleep(waitFor)
+		waitingList.set(hostname, 0)
+	}
+
+	return await fetch(new URL(path, 'https://' + hostname))
+		.then(response => {
+			waitingList.set(hostname, config.courtesyWait)
+			return response.text()
+		})
+		.catch(console.error)
+}
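fetchRss centralizes the politeness delay the old loop managed by hand: waitingList now maps a hostname to the milliseconds still owed before the next request, paid down with sleep() and re-armed to config.courtesyWait after each response. A hypothetical call, assuming it runs inside an async function:

    // First contact with a host: waitingList.get() returns undefined, and
    // undefined !== 0, so sleep(undefined) runs; setTimeout treats a missing
    // delay as 0, so the first request goes out immediately.
    let xml = await fetchRss('nitter.net', 'someUser/rss')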
+const getImages = (user, description) => {
+	let images = description.querySelectorAll('img')
+
+	if(images) {
+		let imageUrls = []
+
+		for(let image of images) {
+			let { src } = image
+
+			if(!src) {
+				let finalSrc = image.srcset.split(', ').pop()
+				src = finalSrc.slice(0, finalSrc.indexOf(' ') )
+			}
+
+			imageUrls.push(src)
+		}
+
+		if(imageUrls.length > 0) {
+			return {
+				images: imageUrls,
+				user
+			}
+		}
+	}
+}
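getImages falls back to srcset when an img carries no src attribute, taking the last (typically largest) candidate. A worked example with an invented srcset value:

    let srcset = 'https://example.com/p_500.jpg 500w, https://example.com/p_1280.jpg 1280w'
    let finalSrc = srcset.split(', ').pop()            // 'https://example.com/p_1280.jpg 1280w'
    let src = finalSrc.slice(0, finalSrc.indexOf(' ')) // 'https://example.com/p_1280.jpg'

Two assumptions are baked in: each candidate carries a width descriptor (with no space, indexOf returns -1 and slice would clip the last character), and since querySelectorAll always returns a NodeList, the if(images) guard is always true; the real filter is the imageUrls.length check.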
+const processNitter = (rss, user) => {
+	return processRss(rss, (item, description) => {
+		// if(dcCreatorRegex.test(item))
+		// 	return
+		// let images = []
+		// for(let [, url] of getImageMatches(description) ) {
+		// 	images.push(url)
+		// }
+		// if(images.length > 0) {
+		// 	return { images, user }
+		// }
+		let creator = item.getElementsByTagName('dc:creator')[0]
+
+		if(creator.innerHTML.slice(1) === user)
+			return getImages(user, description)
+	}, true)
+}
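Two details in the Nitter reducer: dc:creator is read with getElementsByTagName because the colon in a CSS selector parses as a pseudo-class (querySelector would need the escaped form 'dc\\:creator'), and the element holds a handle like '@user', so slice(1) strips the @ before comparing:

    // Sketch of the author check, with an invented handle
    let creator = '@someUser'        // text of <dc:creator>
    creator.slice(1) === 'someUser'  // true, so the post is kept

Retweets carry another account's creator, so the callback returns undefined and processRss drops them.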
+const handleTumblrUser = async (user) => {
+	let rss = await fetchRss(user + '.tumblr.com', 'rss')
+	console.log('Found ' + user)
+	return processTumblr(rss, user)
+}
+
+const processTumblr = (rss, user) => {
+	// const unescapedRss = unescape(rss)
+	return processRss(rss, (item, description) => {
+		let reblog = description.querySelector('p > a.tumblr_blog')
+		// If it's a reblog, skip it
+		if(reblog && reblog.innerHTML !== user) {
+			return
+		}
+
+		return getImages(user, description)
+	})
+}
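The reblog filter relies on Tumblr's RSS convention of opening a reblogged post's description with a link to the source blog, marked with the tumblr_blog class; a post survives only when no such link exists or it points back at the user (a self-reblog). A sketch of what the selector matches, with an invented description:

    const { JSDOM } = require('jsdom')

    // Reblog header in the shape Tumblr's RSS uses (contents invented)
    let html = '<p><a class="tumblr_blog" href="https://other-blog.tumblr.com/post/1">other-blog</a>: caption</p>'
    let { document } = new JSDOM(html).window
    document.querySelector('p > a.tumblr_blog').innerHTML // 'other-blog', not the user, so skip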
 const oneDay = 1000 * 60 * 60 * 24
 
 const print = async feeds => {
 	// Coalate
 	let masterFeed = []
+	let tooLongAgo = (Date.now() - (Date.now() % oneDay)) - oneDay * config.tooLongAgo
+
 	for(let feed of feeds) {
-		masterFeed = masterFeed.concat(feed)
+		for(let post of feed) {
+			if(tooLongAgo && post.date > tooLongAgo)
+				masterFeed.push(post)
+		}
 	}
 
 	masterFeed = masterFeed.sort((a, b) => a.date < b.date)
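A note on the sort at the end of the coalate step: Array.prototype.sort expects a comparator returning a negative, zero, or positive number, but a.date < b.date produces a boolean that coerces to 0 or 1 and can never report "a comes first", so the resulting order is unreliable. Assuming newest-first is the intent, the numeric form would be:

    masterFeed.sort((a, b) => b.date - a.date) // newest posts first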
@@ -118,12 +199,11 @@ const print = async feeds => {
 	console.log('Writing...')
 
 	for(let i = 0; i < pages.length; i++) {
-		Bun.write('out/' + (i == 0 ? 'index' : i) + '.html', renderPage(pages[i], i) )
+		Bun.write('out/' + (i == 0 ? 'index' : i) + '.html', renderPage(pages[i], i, pages.length) )
 	}
-
-	Bun.write('cache.json', JSON.stringify(cache, null, 2))
 }
 
-const renderPage = (posts, index) => {
+const renderPage = (posts, index, pageCount) => {
 	let html = `\
 <html>
 	<head>
@@ -133,6 +213,7 @@ const renderPage = (posts, index) => {
 			body {
 				max-width: 640px;
 				float: right;
+				font-family: sans-serif;
 			}
 
 			p {
@@ -161,12 +242,19 @@ const renderPage = (posts, index) => {
 		html += `\
 ${post.images.map(renderImage).join('\n')}
-<p><b>${post.user}</b> ${date.getMonth()}/${date.getDay()}/${date.getFullYear()} <a href="${post.link}">open</a></p><hr>\n`
+<p><b>${post.user}</b> ${date.getMonth()}/${date.getDate()}/${date.getFullYear()} <a href="${post.link}">open</a></p><hr>\n`
 	}
 
+	let nextPage = index + 1
+	let link = nextPage === pageCount ?
+		`<a href="data:text/html,">end</a>` :
+		`<a href="${nextPage}.html">next</a>`
+
 	html += `
 		<footer>
-			<a href="${index + 1}.html">next</a>
+			${link}
 		</footer>
 	</body>
 </html>`
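On the final page the footer swaps the "next" link for an "end" link pointing at an empty data: URI. The endPage markup added to config.js looks meant for exactly this slot; one way it could be wired up (an assumption, this diff does not do it) would be:

    // Hypothetical: serve config.endPage inline instead of an empty document
    let link = nextPage === pageCount ?
        `<a href="data:text/html,${encodeURIComponent(config.endPage)}">end</a>` :
        `<a href="${nextPage}.html">next</a>`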
@@ -183,6 +271,15 @@ const main = async () => {
 	for(let user of config.feeds.nitter) {
 		feeds.push(await handleNitterUser(user) )
 	}
 
+	console.log('Caching sources...')
+	Bun.write('cache.json', JSON.stringify(cache, null, 2))
+
+	for(let user of config.feeds.tumblr) {
+		feeds.push(await handleTumblrUser(user) )
+	}
+
 	await print(feeds)
 	console.log('Done!')
 }
 
 main()

package.json

@@ -9,6 +9,7 @@
   "author": "",
   "license": "ISC",
   "dependencies": {
+    "jsdom": "^22.1.0",
     "node-fetch": "^3.3.1"
   }
 }

yarn.lock (new file)

@@ -0,0 +1,47 @@
+# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY.
+# yarn lockfile v1
+
+
+data-uri-to-buffer@^4.0.0:
+  version "4.0.1"
+  resolved "https://registry.yarnpkg.com/data-uri-to-buffer/-/data-uri-to-buffer-4.0.1.tgz#d8feb2b2881e6a4f58c2e08acfd0e2834e26222e"
+  integrity sha512-0R9ikRb668HB7QDxT1vkpuUBtqc53YyAwMwGeUFKRojY/NWKvdZ+9UYtRfGmhqNbRkTSVpMbmyhXipFFv2cb/A==
+
+fetch-blob@^3.1.2, fetch-blob@^3.1.4:
+  version "3.2.0"
+  resolved "https://registry.yarnpkg.com/fetch-blob/-/fetch-blob-3.2.0.tgz#f09b8d4bbd45adc6f0c20b7e787e793e309dcce9"
+  integrity sha512-7yAQpD2UMJzLi1Dqv7qFYnPbaPx7ZfFK6PiIxQ4PfkGPyNyl2Ugx+a/umUonmKqjhM4DnfbMvdX6otXq83soQQ==
+  dependencies:
+    node-domexception "^1.0.0"
+    web-streams-polyfill "^3.0.3"
+
+formdata-polyfill@^4.0.10:
+  version "4.0.10"
+  resolved "https://registry.yarnpkg.com/formdata-polyfill/-/formdata-polyfill-4.0.10.tgz#24807c31c9d402e002ab3d8c720144ceb8848423"
+  integrity sha512-buewHzMvYL29jdeQTVILecSaZKnt/RJWjoZCF5OW60Z67/GmSLBkOFM7qh1PI3zFNtJbaZL5eQu1vLfazOwj4g==
+  dependencies:
+    fetch-blob "^3.1.2"
+
+html-escaper@^3.0.3:
+  version "3.0.3"
+  resolved "https://registry.yarnpkg.com/html-escaper/-/html-escaper-3.0.3.tgz#4d336674652beb1dcbc29ef6b6ba7f6be6fdfed6"
+  integrity sha512-RuMffC89BOWQoY0WKGpIhn5gX3iI54O6nRA0yC124NYVtzjmFWBIiFd8M0x+ZdX0P9R4lADg1mgP8C7PxGOWuQ==
+
+node-domexception@^1.0.0:
+  version "1.0.0"
+  resolved "https://registry.yarnpkg.com/node-domexception/-/node-domexception-1.0.0.tgz#6888db46a1f71c0b76b3f7555016b63fe64766e5"
+  integrity sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==
+
+node-fetch@^3.3.1:
+  version "3.3.2"
+  resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-3.3.2.tgz#d1e889bacdf733b4ff3b2b243eb7a12866a0b78b"
+  integrity sha512-dRB78srN/l6gqWulah9SrxeYnxeddIG30+GOqK/9OlLVyLg3HPnr6SqOWTWOXKRwC2eGYCkZ59NNuSgvSrpgOA==
+  dependencies:
+    data-uri-to-buffer "^4.0.0"
+    fetch-blob "^3.1.4"
+    formdata-polyfill "^4.0.10"
+
+web-streams-polyfill@^3.0.3:
+  version "3.2.1"
+  resolved "https://registry.yarnpkg.com/web-streams-polyfill/-/web-streams-polyfill-3.2.1.tgz#71c2718c52b45fd49dbeee88634b3a60ceab42a6"
+  integrity sha512-e0MO3wdXWKrLbL0DgGnUV7WHVuw9OUvL4hjgnPkIeEvESk74gAITi5G606JtZPp39cd8HA9VQzCIvA49LpPN5Q==