diff --git a/package-lock.json b/package-lock.json index db7cb36..cd76dc2 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "txtdot", - "version": "1.5.2", + "version": "1.5.3", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "txtdot", - "version": "1.5.2", + "version": "1.5.3", "license": "MIT", "dependencies": { "@fastify/static": "^6.12.0", @@ -23,7 +23,8 @@ "ip-range-check": "^0.2.0", "json-schema-to-ts": "^3.0.0", "linkedom": "^0.16.6", - "micromatch": "^4.0.5" + "micromatch": "^4.0.5", + "route-parser": "^0.0.5" }, "devDependencies": { "@types/dompurify": "^3.0.5", @@ -31,6 +32,7 @@ "@types/jsdom": "^21.1.6", "@types/micromatch": "^4.0.6", "@types/node": "^20.10.6", + "@types/route-parser": "^0.1.7", "@typescript-eslint/eslint-plugin": "^6.18.0", "@typescript-eslint/parser": "^6.18.0", "clean-css-cli": "^5.6.3", @@ -414,6 +416,12 @@ "undici-types": "~5.26.4" } }, + "node_modules/@types/route-parser": { + "version": "0.1.7", + "resolved": "https://registry.npmjs.org/@types/route-parser/-/route-parser-0.1.7.tgz", + "integrity": "sha512-haO+3HVio/4w+yuMJTjqfSo0ivOV8WnXaOReVD6QN729UGBEyizWNGc2Jd0OLsJDucIod4aJSsPLBeLj2uzMCQ==", + "dev": true + }, "node_modules/@types/semver": { "version": "7.5.6", "resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.5.6.tgz", @@ -2964,6 +2972,14 @@ "node": "*" } }, + "node_modules/route-parser": { + "version": "0.0.5", + "resolved": "https://registry.npmjs.org/route-parser/-/route-parser-0.0.5.tgz", + "integrity": "sha512-nsii+MXoNb7NyF05LP9kaktx6AoBVT/7zUgDnzIb5IoYAvYkbZOAuoLJjVdsyEVxWv0swCxWkKDK4cMva+WDBA==", + "engines": { + "node": ">= 0.9" + } + }, "node_modules/run-parallel": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz", diff --git a/package.json b/package.json index fbd8187..0dc8b46 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "txtdot", - "version": "1.5.2", + "version": "1.5.3", "private": true, "description": "", "main": "dist/app.js", @@ -19,7 +19,8 @@ "ip-range-check": "^0.2.0", "json-schema-to-ts": "^3.0.0", "linkedom": "^0.16.6", - "micromatch": "^4.0.5" + "micromatch": "^4.0.5", + "route-parser": "^0.0.5" }, "devDependencies": { "@types/dompurify": "^3.0.5", @@ -27,6 +28,7 @@ "@types/jsdom": "^21.1.6", "@types/micromatch": "^4.0.6", "@types/node": "^20.10.6", + "@types/route-parser": "^0.1.7", "@typescript-eslint/eslint-plugin": "^6.18.0", "@typescript-eslint/parser": "^6.18.0", "clean-css-cli": "^5.6.3", diff --git a/src/handlers/distributor.ts b/src/handlers/distributor.ts new file mode 100644 index 0000000..5dc7009 --- /dev/null +++ b/src/handlers/distributor.ts @@ -0,0 +1,87 @@ +import { IHandlerOutput } from './handler.interface'; +import axios from '../types/axios'; + +import micromatch from 'micromatch'; + +import DOMPurify from 'dompurify'; + +import { Readable } from 'stream'; + +import isLocalResource from '../utils/islocal'; + +import { LocalResourceError, NotHtmlMimetypeError } from '../errors/main'; +import { HandlerInput } from './handler-input'; +import { decodeStream, parseEncodingName } from '../utils/http'; +import replaceHref from '../utils/replace-href'; +import { parseHTML } from 'linkedom'; +import { Engine } from './engine'; + +interface IEngineId { + [key: string]: number; +} + +export class Distributor { + engines_id: IEngineId = {}; + fallback: Engine[] = []; + list: string[] = []; + constructor() {} + + engine(engine: Engine) { + this.engines_id[engine.name] = this.list.length; + this.fallback.push(engine); + this.list.push(engine.name); + } + + async handlePage( + remoteUrl: string, // remote URL + requestUrl: URL, // proxy URL + engineName?: string, + redirectPath: string = 'get' + ): Promise { + const urlObj = new URL(remoteUrl); + + if (await isLocalResource(urlObj)) { + throw new LocalResourceError(); + } + + const response = await axios.get(remoteUrl); + const data: Readable = response.data; + const mime: string | undefined = + response.headers['content-type']?.toString(); + + if (mime && mime.indexOf('text/html') === -1) { + throw new NotHtmlMimetypeError(); + } + + const engine = this.getFallbackEngine(urlObj.hostname, engineName); + const output = await engine.handle( + new HandlerInput( + await decodeStream(data, parseEncodingName(mime)), + remoteUrl + ) + ); + + // post-process + + const dom = parseHTML(output.content); + replaceHref(dom, requestUrl, new URL(remoteUrl), engineName, redirectPath); + + const purify = DOMPurify(dom.window); + output.content = purify.sanitize(dom.document.toString()); + + return output; + } + + getFallbackEngine(host: string, specified?: string): Engine { + if (specified) { + return this.fallback[this.engines_id[specified]]; + } + for (const engine of this.fallback) { + if (micromatch.isMatch(host, engine.domains)) { + return engine; + } + } + + return this.fallback[0]; + } +} diff --git a/src/handlers/engine.ts b/src/handlers/engine.ts new file mode 100644 index 0000000..3cc54aa --- /dev/null +++ b/src/handlers/engine.ts @@ -0,0 +1,38 @@ +import Route from 'route-parser'; +import { HandlerInput } from './handler-input'; +import { IHandlerOutput } from './handler.interface'; +import { EngineParseError } from '../errors/main'; +import { EngineFunction } from '../types/handlers'; + +interface IRoute { + route: Route; + handler: EngineFunction; +} + +export class Engine { + name: string; + domains: string[]; + routes: IRoute[] = []; + constructor(name: string, domains: string[] = []) { + this.domains = domains; + this.name = name; + } + + route(path: string, handler: EngineFunction) { + this.routes.push({ route: new Route(path), handler: handler }); + } + + async handle(input: HandlerInput): Promise { + const url = new URL(input.getUrl()); + const path = url.pathname + url.search + url.hash; + for (const route of this.routes) { + const match = route.route.match(path); + + if (match) { + return await route.handler(input, match); + } + } + + throw new EngineParseError(`No handler for ${path}. [${this.name}]`); + } +} diff --git a/src/handlers/engines/readability.ts b/src/handlers/engines/readability.ts new file mode 100644 index 0000000..c8a7e79 --- /dev/null +++ b/src/handlers/engines/readability.ts @@ -0,0 +1,26 @@ +import { Readability } from '@mozilla/readability'; +import { EngineParseError } from '../../errors/main'; + +import { Engine } from '../engine'; + +const ReadabilityEngine = new Engine('Readability'); + +ReadabilityEngine.route('*path', async (input, req) => { + const reader = new Readability(input.parseDom().window.document); + const parsed = reader.parse(); + + if (!parsed) { + throw new EngineParseError( + `Parse error (${req.path}). [${ReadabilityEngine.name}]` + ); + } + + return { + content: parsed.content, + textContent: parsed.textContent, + title: parsed.title, + lang: parsed.lang, + }; +}); + +export default ReadabilityEngine; diff --git a/src/handlers/searx.ts b/src/handlers/engines/searx.ts similarity index 70% rename from src/handlers/searx.ts rename to src/handlers/engines/searx.ts index 2aeafe8..9f9ba3a 100644 --- a/src/handlers/searx.ts +++ b/src/handlers/engines/searx.ts @@ -1,29 +1,24 @@ -import { HandlerInput } from './handler-input'; -import { IHandlerOutput } from './handler.interface'; +import { Engine } from '../engine'; -export default async function searx( - input: HandlerInput -): Promise { +const SearXEngine = new Engine('SearX', ['searx.*']); + +SearXEngine.route('/search?q=:search', async (input, req) => { const document = input.parseDom().window.document; - - const search = document.getElementById('q') as HTMLTextAreaElement; - + const search = req.search; const url = new URL(input.getUrl()); - const page = parseInt(url.searchParams.get('pageno') || '1'); const page_footer = `${ page !== 1 - ? `Previous |` : '' - } Next`; const articles = Array.from(document.querySelectorAll('.result')); - const articles_parsed = articles.map((a) => { const parsed = { url: @@ -51,9 +46,9 @@ export default async function searx( return { content, textContent, - title: `${search.value} - Searx - Page ${page}`, + title: `${search} - Searx - Page ${page}`, lang: document.documentElement.lang, }; -} +}); -export const SearxDomains = ['searx.*']; +export default SearXEngine; diff --git a/src/handlers/engines/stackoverflow.ts b/src/handlers/engines/stackoverflow.ts new file mode 100644 index 0000000..e8fe6af --- /dev/null +++ b/src/handlers/engines/stackoverflow.ts @@ -0,0 +1,45 @@ +import { Engine } from '../engine'; + +const SOE = new Engine('StackOverflow', [ + 'stackoverflow.com', + '*.stackoverflow.com', + '*.stackexchange.com', + 'askubuntu.com', + 'stackapps.com', + 'mathoverflow.net', + 'superuser.com', + 'serverfault.com', +]); + +SOE.route('/questions/:id/:slug', async (input, req) => { + const document = input.parseDom().window.document; + + const questionEl = document.getElementById('question'); + const question = postParser(questionEl); + + const title = document.querySelector('.question-hyperlink')?.innerHTML || ''; + + const allAnswers = [...document.querySelectorAll('.answer')]; + const answers = allAnswers.map((a) => postParser(a)); + + return { + content: `${question}
${answers.length} answers
${answers.join( + '
' + )}`, + textContent: `${req.id}/${req.slug}\n`, + title, + lang: 'en', + }; +}); + +function postParser(el: Element | null): string { + if (!el) { + return ''; + } + const body = el.querySelector('.js-post-body')?.innerHTML || ''; + const voteCount = el.querySelector('.js-vote-count')?.textContent || ''; + + return `

${voteCount} votes

${body}`; +} + +export default SOE; diff --git a/src/handlers/google.ts b/src/handlers/google.ts deleted file mode 100644 index 392edd5..0000000 --- a/src/handlers/google.ts +++ /dev/null @@ -1,75 +0,0 @@ -import { HandlerInput } from './handler-input'; -import { IHandlerOutput } from './handler.interface'; -import { EngineParseError } from '../errors/main'; - -export default async function google( - input: HandlerInput -): Promise { - const window = input.parseDom().window; - - const googleAnchors = [ - ...window.document.querySelectorAll('a[jsname=UWckNb]'), - ] as HTMLAnchorElement[]; - - if (!googleAnchors) { - throw new EngineParseError( - 'Failed to find anchors in search result [google]' - ); - } - - const results = googleAnchors - .map((a: HTMLAnchorElement): GoogleProps => { - const parsedHref = new URL(new URL(a.href).searchParams.get('url')!); - return { - href: a.href!, - siteName: parsedHref.hostname, - heading: a.childNodes[1]?.textContent, - }; - }) - .filter((a) => a.heading); - - const convertToFormat = (result: GoogleProps, isHtml: boolean) => { - return isHtml - ? `

${result.siteName} - ${result.heading}

` - : `${result.siteName} - ${result.heading} > ${result.href}`; - }; - - const content = results.map((result) => { - return convertToFormat(result, true); - }); - - const textContent = results.map((result) => { - return convertToFormat(result, false); - }); - - const search = window.document.getElementById( - 'APjFqb' - ) as HTMLTextAreaElement; - - const searchForm = ` -
- - -
- `; - - return { - content: `${searchForm}${content.join('')}`, - textContent: textContent.join('\n'), - }; -} - -export const GoogleDomains = [ - 'google.*', - 'google.co.*', - 'google.com.*', - 'www.google.*', - 'www.google.co.*', - 'www.google.com.*', -]; - -interface GoogleProps { - href: string; - siteName: string; - heading: string | null; -} diff --git a/src/handlers/main.ts b/src/handlers/main.ts index 81a3636..59c819f 100644 --- a/src/handlers/main.ts +++ b/src/handlers/main.ts @@ -1,97 +1,13 @@ -import { IHandlerOutput } from './handler.interface'; -import { Engines, EngineFunction, EnginesMatch } from '../types/handlers'; -import axios from '../types/axios'; +import { Distributor } from './distributor'; +import Readability from './engines/readability'; +import SearX from './engines/searx'; +import StackOverflow from './engines/stackoverflow'; -import micromatch from 'micromatch'; +const distributor = new Distributor(); -import DOMPurify from 'dompurify'; +distributor.engine(Readability); +distributor.engine(SearX); +distributor.engine(StackOverflow); -import { Readable } from 'stream'; - -import readability from './readability'; -import google, { GoogleDomains } from './google'; -import stackoverflow, { StackOverflowDomains } from './stackoverflow/main'; -import searx, { SearxDomains } from './searx'; - -import isLocalResource from '../utils/islocal'; - -import { LocalResourceError, NotHtmlMimetypeError } from '../errors/main'; -import { HandlerInput } from './handler-input'; -import { decodeStream, parseEncodingName } from '../utils/http'; -import replaceHref from '../utils/replace-href'; -import { parseHTML } from 'linkedom'; - -export default async function handlePage( - remoteUrl: string, // remote URL - requestUrl: URL, // proxy URL - engine?: string, - redirectPath: string = 'get' -): Promise { - const urlObj = new URL(remoteUrl); - - if (await isLocalResource(urlObj)) { - throw new LocalResourceError(); - } - - const response = await axios.get(remoteUrl); - const data: Readable = response.data; - const mime: string | undefined = response.headers['content-type']?.toString(); - - if (mime && mime.indexOf('text/html') === -1) { - throw new NotHtmlMimetypeError(); - } - - const handler = getFallbackEngine(urlObj.hostname, engine); - const output = await handler( - new HandlerInput( - await decodeStream(data, parseEncodingName(mime)), - remoteUrl - ) - ); - - // post-process - - const dom = parseHTML(output.content); - replaceHref(dom, requestUrl, new URL(remoteUrl), engine, redirectPath); - - const purify = DOMPurify(dom.window); - output.content = purify.sanitize(dom.document.toString()); - - return output; -} - -function getFallbackEngine(host: string, specified?: string): EngineFunction { - if (specified) { - return engines[specified]; - } - for (const engine of fallback) { - if (micromatch.isMatch(host, engine.pattern)) { - return engine.engine; - } - } - return engines.readability; -} - -export const engines: Engines = { - readability, - google, - stackoverflow, - searx, -}; - -export const engineList: string[] = Object.keys(engines); - -export const fallback: EnginesMatch = [ - { - pattern: GoogleDomains, - engine: engines.google, - }, - { - pattern: StackOverflowDomains, - engine: engines.stackoverflow, - }, - { - pattern: SearxDomains, - engine: engines.searx, - }, -]; +export const engineList = distributor.list; +export default distributor; diff --git a/src/handlers/readability.ts b/src/handlers/readability.ts deleted file mode 100644 index f8e2fcb..0000000 --- a/src/handlers/readability.ts +++ /dev/null @@ -1,22 +0,0 @@ -import { Readability } from '@mozilla/readability'; -import { HandlerInput } from './handler-input'; -import { IHandlerOutput } from './handler.interface'; -import { EngineParseError } from '../errors/main'; - -export default async function readability( - input: HandlerInput -): Promise { - const reader = new Readability(input.parseDom().window.document); - const parsed = reader.parse(); - - if (!parsed) { - throw new EngineParseError('Failed to parse [readability]'); - } - - return { - content: parsed.content, - textContent: parsed.textContent, - title: parsed.title, - lang: parsed.lang, - }; -} diff --git a/src/handlers/stackoverflow/main.ts b/src/handlers/stackoverflow/main.ts deleted file mode 100644 index f69344d..0000000 --- a/src/handlers/stackoverflow/main.ts +++ /dev/null @@ -1,43 +0,0 @@ -import { HandlerInput } from '../handler-input'; -import { IHandlerOutput } from '../handler.interface'; -import { EngineParseError } from '../../errors/main'; -import qPostsHandler from './questions-posts'; - -export default async function stackoverflow( - input: HandlerInput -): Promise { - const window = input.parseDom().window; - - const url = new URL(window.location.href); - const path = url.pathname.split('/').filter((p) => p !== ''); - - let result: IHandlerOutput = { - content: '', - textContent: '', - title: '', - lang: '', - }; - - if (path[0] === 'questions') { - if (path.length === 3) { - result = await qPostsHandler(window); - } else if (path.length === 1) { - result.content = 'questions'; - } else { - throw new EngineParseError('Invalid URL [stackoverflow]'); - } - } - - return result; -} - -export const StackOverflowDomains = [ - 'stackoverflow.com', - '*.stackoverflow.com', - '*.stackexchange.com', - 'askubuntu.com', - 'stackapps.com', - 'mathoverflow.net', - 'superuser.com', - 'serverfault.com', -]; diff --git a/src/handlers/stackoverflow/post-parser.ts b/src/handlers/stackoverflow/post-parser.ts deleted file mode 100644 index 7b06a3e..0000000 --- a/src/handlers/stackoverflow/post-parser.ts +++ /dev/null @@ -1,9 +0,0 @@ -export default function postParser(el: Element | null): string { - if (!el) { - return ''; - } - const body = el.querySelector('.js-post-body')?.innerHTML || ''; - const voteCount = el.querySelector('.js-vote-count')?.textContent || ''; - - return `

${voteCount} votes

${body}`; -} diff --git a/src/handlers/stackoverflow/questions-posts.ts b/src/handlers/stackoverflow/questions-posts.ts deleted file mode 100644 index c1eb08c..0000000 --- a/src/handlers/stackoverflow/questions-posts.ts +++ /dev/null @@ -1,25 +0,0 @@ -import { IHandlerOutput } from '../handler.interface'; -import postParser from './post-parser'; - -export default async function qPostsHandler( - window: Window -): Promise { - const questionEl = window.document.getElementById('question'); - const question = postParser(questionEl); - - const title = - window.document.querySelector('.question-hyperlink')?.innerHTML || ''; - - const allAnswers = [...window.document.querySelectorAll('.answer')]; - - const answers = allAnswers.map((a) => postParser(a)); - - return { - content: `${question}
${answers.length} answers
${answers.join( - '
' - )}`, - textContent: 'question', - title, - lang: 'en', - }; -} diff --git a/src/publicConfig.ts b/src/publicConfig.ts index 12820a4..e92b96e 100644 --- a/src/publicConfig.ts +++ b/src/publicConfig.ts @@ -1,5 +1,5 @@ export default { - version: '1.5.2', + version: '1.5.3', description: 'txtdot is an HTTP proxy that parses only text, links and pictures from pages reducing internet bandwidth usage, removing ads and heavy scripts', }; diff --git a/src/routes/api/parse.ts b/src/routes/api/parse.ts index bff8af4..51f4edd 100644 --- a/src/routes/api/parse.ts +++ b/src/routes/api/parse.ts @@ -6,7 +6,7 @@ import { parseSchema, } from '../../types/requests/api'; -import handlePage from '../../handlers/main'; +import distributor from '../../handlers/main'; import { generateRequestUrl } from '../../utils/generate'; export default async function parseRoute(fastify: FastifyInstance) { @@ -15,7 +15,7 @@ export default async function parseRoute(fastify: FastifyInstance) { { schema: parseSchema }, async (request: EngineRequest) => { return { - data: await handlePage( + data: await distributor.handlePage( request.query.url, generateRequestUrl( request.protocol, diff --git a/src/routes/api/raw-html.ts b/src/routes/api/raw-html.ts index 4deb197..51c66bd 100644 --- a/src/routes/api/raw-html.ts +++ b/src/routes/api/raw-html.ts @@ -2,7 +2,7 @@ import { FastifyInstance } from 'fastify'; import { IParseSchema, rawHtmlSchema } from '../../types/requests/api'; -import handlePage from '../../handlers/main'; +import distributor from '../../handlers/main'; import { generateRequestUrl } from '../../utils/generate'; export default async function rawHtml(fastify: FastifyInstance) { @@ -12,7 +12,7 @@ export default async function rawHtml(fastify: FastifyInstance) { async (request, reply) => { reply.type('text/html; charset=utf-8'); return ( - await handlePage( + await distributor.handlePage( request.query.url, generateRequestUrl( request.protocol, diff --git a/src/routes/browser/get.ts b/src/routes/browser/get.ts index cefce11..e5492dd 100644 --- a/src/routes/browser/get.ts +++ b/src/routes/browser/get.ts @@ -1,7 +1,7 @@ import { FastifyInstance } from 'fastify'; import { GetSchema, IGetSchema } from '../../types/requests/browser'; -import handlePage from '../../handlers/main'; +import distributor from '../../handlers/main'; import { generateRequestUrl } from '../../utils/generate'; import getConfig from '../../config/main'; @@ -14,7 +14,7 @@ export default async function getRoute(fastify: FastifyInstance) { const remoteUrl = request.query.url; const engine = request.query.engine; - const parsed = await handlePage( + const parsed = await distributor.handlePage( remoteUrl, generateRequestUrl( request.protocol, diff --git a/src/types/handlers.ts b/src/types/handlers.ts index 58138c7..4ddb92d 100644 --- a/src/types/handlers.ts +++ b/src/types/handlers.ts @@ -1,8 +1,9 @@ +import { Engine } from '../handlers/engine'; import { HandlerInput } from '../handlers/handler-input'; import { IHandlerOutput } from '../handlers/handler.interface'; export interface Engines { - [key: string]: EngineFunction; + [key: string]: Engine; } export type EngineMatch = { @@ -10,5 +11,12 @@ export type EngineMatch = { engine: EngineFunction; }; -export type EngineFunction = (input: HandlerInput) => Promise; +export interface RouteValues { + [key: string]: string; +} + +export type EngineFunction = ( + input: HandlerInput, + req: RouteValues +) => Promise; export type EnginesMatch = EngineMatch[];