Engine routing (#82)

* refactor: move engines to folder

* refactor: engine class

* refactor: add distributor and readability, searx engines class

* delete: google engine

Useless now that the SearX engine has been added.

* fix: stackoverflow

* update version
This commit is contained in:
Artemy Egorov 2024-02-17 15:25:54 +03:00 committed by GitHub
parent 92ab68c587
commit c9f9e48acb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
18 changed files with 256 additions and 297 deletions

22
package-lock.json generated
View File

@ -1,12 +1,12 @@
{ {
"name": "txtdot", "name": "txtdot",
"version": "1.5.2", "version": "1.5.3",
"lockfileVersion": 3, "lockfileVersion": 3,
"requires": true, "requires": true,
"packages": { "packages": {
"": { "": {
"name": "txtdot", "name": "txtdot",
"version": "1.5.2", "version": "1.5.3",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@fastify/static": "^6.12.0", "@fastify/static": "^6.12.0",
@ -23,7 +23,8 @@
"ip-range-check": "^0.2.0", "ip-range-check": "^0.2.0",
"json-schema-to-ts": "^3.0.0", "json-schema-to-ts": "^3.0.0",
"linkedom": "^0.16.6", "linkedom": "^0.16.6",
"micromatch": "^4.0.5" "micromatch": "^4.0.5",
"route-parser": "^0.0.5"
}, },
"devDependencies": { "devDependencies": {
"@types/dompurify": "^3.0.5", "@types/dompurify": "^3.0.5",
@ -31,6 +32,7 @@
"@types/jsdom": "^21.1.6", "@types/jsdom": "^21.1.6",
"@types/micromatch": "^4.0.6", "@types/micromatch": "^4.0.6",
"@types/node": "^20.10.6", "@types/node": "^20.10.6",
"@types/route-parser": "^0.1.7",
"@typescript-eslint/eslint-plugin": "^6.18.0", "@typescript-eslint/eslint-plugin": "^6.18.0",
"@typescript-eslint/parser": "^6.18.0", "@typescript-eslint/parser": "^6.18.0",
"clean-css-cli": "^5.6.3", "clean-css-cli": "^5.6.3",
@ -414,6 +416,12 @@
"undici-types": "~5.26.4" "undici-types": "~5.26.4"
} }
}, },
"node_modules/@types/route-parser": {
"version": "0.1.7",
"resolved": "https://registry.npmjs.org/@types/route-parser/-/route-parser-0.1.7.tgz",
"integrity": "sha512-haO+3HVio/4w+yuMJTjqfSo0ivOV8WnXaOReVD6QN729UGBEyizWNGc2Jd0OLsJDucIod4aJSsPLBeLj2uzMCQ==",
"dev": true
},
"node_modules/@types/semver": { "node_modules/@types/semver": {
"version": "7.5.6", "version": "7.5.6",
"resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.5.6.tgz", "resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.5.6.tgz",
@ -2964,6 +2972,14 @@
"node": "*" "node": "*"
} }
}, },
"node_modules/route-parser": {
"version": "0.0.5",
"resolved": "https://registry.npmjs.org/route-parser/-/route-parser-0.0.5.tgz",
"integrity": "sha512-nsii+MXoNb7NyF05LP9kaktx6AoBVT/7zUgDnzIb5IoYAvYkbZOAuoLJjVdsyEVxWv0swCxWkKDK4cMva+WDBA==",
"engines": {
"node": ">= 0.9"
}
},
"node_modules/run-parallel": { "node_modules/run-parallel": {
"version": "1.2.0", "version": "1.2.0",
"resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz", "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz",

View File

@ -1,6 +1,6 @@
{ {
"name": "txtdot", "name": "txtdot",
"version": "1.5.2", "version": "1.5.3",
"private": true, "private": true,
"description": "", "description": "",
"main": "dist/app.js", "main": "dist/app.js",
@ -19,7 +19,8 @@
"ip-range-check": "^0.2.0", "ip-range-check": "^0.2.0",
"json-schema-to-ts": "^3.0.0", "json-schema-to-ts": "^3.0.0",
"linkedom": "^0.16.6", "linkedom": "^0.16.6",
"micromatch": "^4.0.5" "micromatch": "^4.0.5",
"route-parser": "^0.0.5"
}, },
"devDependencies": { "devDependencies": {
"@types/dompurify": "^3.0.5", "@types/dompurify": "^3.0.5",
@ -27,6 +28,7 @@
"@types/jsdom": "^21.1.6", "@types/jsdom": "^21.1.6",
"@types/micromatch": "^4.0.6", "@types/micromatch": "^4.0.6",
"@types/node": "^20.10.6", "@types/node": "^20.10.6",
"@types/route-parser": "^0.1.7",
"@typescript-eslint/eslint-plugin": "^6.18.0", "@typescript-eslint/eslint-plugin": "^6.18.0",
"@typescript-eslint/parser": "^6.18.0", "@typescript-eslint/parser": "^6.18.0",
"clean-css-cli": "^5.6.3", "clean-css-cli": "^5.6.3",

View File

@ -0,0 +1,87 @@
import { IHandlerOutput } from './handler.interface';
import axios from '../types/axios';
import micromatch from 'micromatch';
import DOMPurify from 'dompurify';
import { Readable } from 'stream';
import isLocalResource from '../utils/islocal';
import { LocalResourceError, NotHtmlMimetypeError } from '../errors/main';
import { HandlerInput } from './handler-input';
import { decodeStream, parseEncodingName } from '../utils/http';
import replaceHref from '../utils/replace-href';
import { parseHTML } from 'linkedom';
import { Engine } from './engine';
/** Maps a registered engine name to its index in `Distributor.fallback`. */
interface IEngineId {
  [key: string]: number;
}

/**
 * Registry of engines plus the fetch / parse / post-process pipeline that
 * turns a remote page into an `IHandlerOutput`.
 */
export class Distributor {
  /** Engine name -> index into `fallback`. */
  engines_id: IEngineId = {};
  /** Registered engines in registration order; index 0 is the default. */
  fallback: Engine[] = [];
  /** Registered engine names in registration order. */
  list: string[] = [];

  constructor() {}

  /**
   * Registers an engine. The first engine registered becomes the default
   * used when no engine's domain patterns match the requested host.
   */
  engine(engine: Engine) {
    this.engines_id[engine.name] = this.list.length;
    this.fallback.push(engine);
    this.list.push(engine.name);
  }

  /**
   * Fetches `remoteUrl`, runs the selected engine on the decoded body and
   * post-processes the resulting HTML.
   *
   * @param remoteUrl - URL of the page to fetch.
   * @param requestUrl - URL of the proxy request (passed to `replaceHref`).
   * @param engineName - Optional name of a registered engine to force.
   * @param redirectPath - Path segment passed to `replaceHref`.
   * @throws LocalResourceError when `isLocalResource` reports the target as local.
   * @throws NotHtmlMimetypeError when a content-type header is present and
   *   does not contain `text/html`.
   * @throws Error when `engineName` is not a registered engine.
   */
  async handlePage(
    remoteUrl: string, // remote URL
    requestUrl: URL, // proxy URL
    engineName?: string,
    redirectPath: string = 'get'
  ): Promise<IHandlerOutput> {
    const urlObj = new URL(remoteUrl);

    if (await isLocalResource(urlObj)) {
      throw new LocalResourceError();
    }

    const response = await axios.get(remoteUrl);
    const data: Readable = response.data;
    const mime: string | undefined =
      response.headers['content-type']?.toString();

    // A missing content-type is let through; only an explicit non-HTML type is rejected.
    if (mime && !mime.includes('text/html')) {
      throw new NotHtmlMimetypeError();
    }

    const engine = this.getFallbackEngine(urlObj.hostname, engineName);
    const output = await engine.handle(
      new HandlerInput(
        await decodeStream(data, parseEncodingName(mime)),
        remoteUrl
      )
    );

    // post-process
    const dom = parseHTML(output.content);
    replaceHref(dom, requestUrl, new URL(remoteUrl), engineName, redirectPath);

    const purify = DOMPurify(dom.window);
    output.content = purify.sanitize(dom.document.toString());

    return output;
  }

  /**
   * Picks the engine for a request.
   *
   * An explicitly specified engine wins; otherwise the first registered
   * engine whose domain patterns match `host` is used, and failing that the
   * first registered engine.
   *
   * @throws Error when `specified` names an engine that was never registered,
   *   or when no engine has been registered at all. Previously both cases
   *   returned `undefined` and crashed later with an opaque TypeError.
   */
  getFallbackEngine(host: string, specified?: string): Engine {
    if (specified) {
      // Own-property check so inherited keys such as "constructor" are not
      // mistaken for registered engine names.
      const known = Object.prototype.hasOwnProperty.call(
        this.engines_id,
        specified
      );
      const engine = known
        ? this.fallback[this.engines_id[specified]]
        : undefined;

      if (!engine) {
        throw new Error(`Unknown engine: ${specified}`);
      }

      return engine;
    }

    for (const engine of this.fallback) {
      if (micromatch.isMatch(host, engine.domains)) {
        return engine;
      }
    }

    const defaultEngine = this.fallback[0];
    if (!defaultEngine) {
      throw new Error('No engines registered');
    }

    return defaultEngine;
  }
}

38
src/handlers/engine.ts Normal file
View File

@ -0,0 +1,38 @@
import Route from 'route-parser';
import { HandlerInput } from './handler-input';
import { IHandlerOutput } from './handler.interface';
import { EngineParseError } from '../errors/main';
import { EngineFunction } from '../types/handlers';
/** A compiled route pattern together with the handler that serves it. */
interface IRoute {
  route: Route;
  handler: EngineFunction;
}

/**
 * A named page parser: a set of domain patterns plus an ordered list of
 * path routes, each bound to a handler.
 */
export class Engine {
  name: string;
  domains: string[];
  routes: IRoute[] = [];

  /**
   * @param name - Engine name; also used in error messages.
   * @param domains - Domain patterns for this engine (empty by default).
   */
  constructor(name: string, domains: string[] = []) {
    this.domains = domains;
    this.name = name;
  }

  /** Appends a route; routes are tried in the order they were added. */
  route(path: string, handler: EngineFunction) {
    const route = new Route(path);
    this.routes.push({ route, handler });
  }

  /**
   * Dispatches `input` to the first route matching the URL's
   * pathname + search + hash.
   *
   * @throws EngineParseError when no registered route matches.
   */
  async handle(input: HandlerInput): Promise<IHandlerOutput> {
    const { pathname, search, hash } = new URL(input.getUrl());
    const path = pathname + search + hash;

    for (const { route, handler } of this.routes) {
      const params = route.match(path);

      if (params) {
        return await handler(input, params);
      }
    }

    throw new EngineParseError(`No handler for ${path}. [${this.name}]`);
  }
}

View File

@ -0,0 +1,26 @@
import { Readability } from '@mozilla/readability';
import { EngineParseError } from '../../errors/main';
import { Engine } from '../engine';
/**
 * Engine backed by Mozilla Readability. It is created without domain
 * patterns and has a single splat route whose captured value is read as
 * `req.path` in the error message.
 */
const ReadabilityEngine = new Engine('Readability');

ReadabilityEngine.route('*path', async (input, req) => {
  const { document } = input.parseDom().window;
  const article = new Readability(document).parse();

  if (article) {
    const { content, textContent, title, lang } = article;
    return { content, textContent, title, lang };
  }

  // Readability produced no parse result for this document.
  throw new EngineParseError(
    `Parse error (${req.path}). [${ReadabilityEngine.name}]`
  );
});

export default ReadabilityEngine;

View File

@ -1,29 +1,24 @@
import { HandlerInput } from './handler-input'; import { Engine } from '../engine';
import { IHandlerOutput } from './handler.interface';
export default async function searx( const SearXEngine = new Engine('SearX', ['searx.*']);
input: HandlerInput
): Promise<IHandlerOutput> { SearXEngine.route('/search?q=:search', async (input, req) => {
const document = input.parseDom().window.document; const document = input.parseDom().window.document;
const search = req.search;
const search = document.getElementById('q') as HTMLTextAreaElement;
const url = new URL(input.getUrl()); const url = new URL(input.getUrl());
const page = parseInt(url.searchParams.get('pageno') || '1'); const page = parseInt(url.searchParams.get('pageno') || '1');
const page_footer = `${ const page_footer = `${
page !== 1 page !== 1
? `<a href="${url.origin}${url.pathname}?q=${search.value}&pageno=${ ? `<a href="${url.origin}${url.pathname}?q=${search}&pageno=${
page - 1 page - 1
}">Previous </a>|` }">Previous </a>|`
: '' : ''
}<a href="${url.origin}${url.pathname}?q=${search.value}&pageno=${ }<a href="${url.origin}${url.pathname}?q=${search}&pageno=${
page + 1 page + 1
}"> Next</a>`; }"> Next</a>`;
const articles = Array.from(document.querySelectorAll('.result')); const articles = Array.from(document.querySelectorAll('.result'));
const articles_parsed = articles.map((a) => { const articles_parsed = articles.map((a) => {
const parsed = { const parsed = {
url: url:
@ -51,9 +46,9 @@ export default async function searx(
return { return {
content, content,
textContent, textContent,
title: `${search.value} - Searx - Page ${page}`, title: `${search} - Searx - Page ${page}`,
lang: document.documentElement.lang, lang: document.documentElement.lang,
}; };
} });
export const SearxDomains = ['searx.*']; export default SearXEngine;

View File

@ -0,0 +1,45 @@
import { Engine } from '../engine';
/** Engine for Stack Overflow / Stack Exchange family sites. */
const SOE = new Engine('StackOverflow', [
  'stackoverflow.com',
  '*.stackoverflow.com',
  '*.stackexchange.com',
  'askubuntu.com',
  'stackapps.com',
  'mathoverflow.net',
  'superuser.com',
  'serverfault.com',
]);

// Question page: renders the question followed by every answer, separated by rules.
SOE.route('/questions/:id/:slug', async (input, req) => {
  const { document } = input.parseDom().window;

  const question = postParser(document.getElementById('question'));
  const title = document.querySelector('.question-hyperlink')?.innerHTML || '';
  const answers = Array.from(document.querySelectorAll('.answer'), (answerEl) =>
    postParser(answerEl)
  );

  const renderedAnswers = answers.join('<hr>');

  return {
    content: `${question}<hr>${answers.length} answers <hr>${renderedAnswers}`,
    textContent: `${req.id}/${req.slug}\n`,
    title,
    lang: 'en',
  };
});
/**
 * Renders one post element (question or answer) as a vote-count heading
 * followed by the post body markup.
 *
 * @param el - Post container element, or null when it was not found.
 * @returns The rendered HTML, or an empty string when `el` is missing.
 */
function postParser(el: Element | null): string {
  if (el) {
    const bodyHtml = el.querySelector('.js-post-body')?.innerHTML || '';
    const votes = el.querySelector('.js-vote-count')?.textContent || '';

    return `<h3>${votes} votes</h3>${bodyHtml}`;
  }

  return '';
}
export default SOE;

View File

@ -1,75 +0,0 @@
import { HandlerInput } from './handler-input';
import { IHandlerOutput } from './handler.interface';
import { EngineParseError } from '../errors/main';
/**
 * Extracts the hostname of the page a Google result link points at, read
 * from the link's `url` query parameter.
 *
 * @returns The hostname, or null when the href or its `url` parameter is
 *   missing or not a valid absolute URL.
 */
function targetHostname(href: string): string | null {
  try {
    const target = new URL(href).searchParams.get('url');
    return target === null ? null : new URL(target).hostname;
  } catch {
    return null;
  }
}

/**
 * Engine for Google search result pages: lists each result as
 * "site - heading" and prepends a small search form.
 *
 * @throws EngineParseError when the page contains no result anchors.
 */
export default async function google(
  input: HandlerInput
): Promise<IHandlerOutput> {
  const window = input.parseDom().window;

  const googleAnchors = [
    ...window.document.querySelectorAll('a[jsname=UWckNb]'),
  ] as HTMLAnchorElement[];

  // An array is always truthy, so "nothing found" must be detected by length.
  if (googleAnchors.length === 0) {
    throw new EngineParseError(
      'Failed to find anchors in search result [google]'
    );
  }

  // Anchors without a heading or without a parsable target URL are skipped
  // instead of aborting the whole page.
  const results = googleAnchors.flatMap((a): GoogleProps[] => {
    const siteName = targetHostname(a.href);
    const heading = a.childNodes[1]?.textContent;

    return siteName !== null && heading
      ? [{ href: a.href, siteName, heading }]
      : [];
  });

  const convertToFormat = (result: GoogleProps, isHtml: boolean) => {
    return isHtml
      ? `<p><a href="${result.href}">${result.siteName} - ${result.heading}</a></p>`
      : `${result.siteName} - ${result.heading} > ${result.href}`;
  };

  const content = results.map((result) => {
    return convertToFormat(result, true);
  });

  const textContent = results.map((result) => {
    return convertToFormat(result, false);
  });

  const search = window.document.getElementById(
    'APjFqb'
  ) as HTMLTextAreaElement | null;

  // Fall back to an empty box rather than the literal text "undefined".
  const searchValue = search?.value ?? '';

  const searchForm = `
<form onsubmit="window.location.href = '/get?url=https://www.google.com/search?q='+ document.getElementById('q').value.split(' ').join('+'); return false">
<input type="text" name="q" id="q" value="${searchValue}">
<input type="button" value="Search" onclick="window.location.href = '/get?url=https://www.google.com/search?q='+ document.getElementById('q').value.split(' ').join('+');">
</form>
`;

  return {
    content: `${searchForm}${content.join('')}`,
    textContent: textContent.join('\n'),
  };
}

/** Host patterns for which this engine is selected. */
export const GoogleDomains = [
  'google.*',
  'google.co.*',
  'google.com.*',
  'www.google.*',
  'www.google.co.*',
  'www.google.com.*',
];

/** One parsed search result. */
interface GoogleProps {
  href: string;
  siteName: string;
  heading: string | null;
}

View File

@ -1,97 +1,13 @@
import { IHandlerOutput } from './handler.interface'; import { Distributor } from './distributor';
import { Engines, EngineFunction, EnginesMatch } from '../types/handlers'; import Readability from './engines/readability';
import axios from '../types/axios'; import SearX from './engines/searx';
import StackOverflow from './engines/stackoverflow';
import micromatch from 'micromatch'; const distributor = new Distributor();
import DOMPurify from 'dompurify'; distributor.engine(Readability);
distributor.engine(SearX);
distributor.engine(StackOverflow);
import { Readable } from 'stream'; export const engineList = distributor.list;
export default distributor;
import readability from './readability';
import google, { GoogleDomains } from './google';
import stackoverflow, { StackOverflowDomains } from './stackoverflow/main';
import searx, { SearxDomains } from './searx';
import isLocalResource from '../utils/islocal';
import { LocalResourceError, NotHtmlMimetypeError } from '../errors/main';
import { HandlerInput } from './handler-input';
import { decodeStream, parseEncodingName } from '../utils/http';
import replaceHref from '../utils/replace-href';
import { parseHTML } from 'linkedom';
/**
 * Fetches a remote page, runs the selected engine over the decoded body and
 * post-processes the resulting HTML.
 *
 * @param remoteUrl - URL of the page to fetch.
 * @param requestUrl - URL of the proxy request (passed to `replaceHref`).
 * @param engine - Optional engine name to force.
 * @param redirectPath - Path segment passed to `replaceHref`.
 * @throws LocalResourceError when `isLocalResource` reports the target as local.
 * @throws NotHtmlMimetypeError when a content-type header is present and
 *   does not contain `text/html`.
 */
export default async function handlePage(
  remoteUrl: string, // remote URL
  requestUrl: URL, // proxy URL
  engine?: string,
  redirectPath: string = 'get'
): Promise<IHandlerOutput> {
  const target = new URL(remoteUrl);
  const isLocal = await isLocalResource(target);

  if (isLocal) {
    throw new LocalResourceError();
  }

  const response = await axios.get(remoteUrl);
  const body: Readable = response.data;
  const mime: string | undefined = response.headers['content-type']?.toString();

  // A missing content-type is accepted; only an explicit non-HTML type is rejected.
  const looksLikeHtml = !mime || mime.indexOf('text/html') !== -1;
  if (!looksLikeHtml) {
    throw new NotHtmlMimetypeError();
  }

  const handler = getFallbackEngine(target.hostname, engine);
  const html = await decodeStream(body, parseEncodingName(mime));
  const output = await handler(new HandlerInput(html, remoteUrl));

  // post-process
  const dom = parseHTML(output.content);
  replaceHref(dom, requestUrl, new URL(remoteUrl), engine, redirectPath);

  output.content = DOMPurify(dom.window).sanitize(dom.document.toString());

  return output;
}
/**
 * Picks the engine function for a request.
 *
 * An explicitly specified engine wins; otherwise the first `fallback` entry
 * whose pattern matches `host` is used, and failing that `readability`.
 *
 * @throws Error when `specified` is not a key of `engines`. Previously an
 *   unknown name returned `undefined`, and an inherited key such as
 *   "constructor" returned an unrelated built-in function.
 */
function getFallbackEngine(host: string, specified?: string): EngineFunction {
  if (specified) {
    // Own-property check: a plain object lookup would also resolve
    // prototype members like "constructor" or "toString".
    if (!Object.prototype.hasOwnProperty.call(engines, specified)) {
      throw new Error(`Unknown engine: ${specified}`);
    }

    return engines[specified];
  }

  for (const { pattern, engine } of fallback) {
    if (micromatch.isMatch(host, pattern)) {
      return engine;
    }
  }

  return engines.readability;
}
// Engine functions keyed by name; getFallbackEngine looks an explicitly
// requested engine up here.
export const engines: Engines = {
readability,
google,
stackoverflow,
searx,
};
// Names of all available engines, derived from the keys of `engines`.
export const engineList: string[] = Object.keys(engines);
// Domain-pattern table consulted in order by getFallbackEngine when no engine
// is specified; the first entry whose pattern matches the host is used.
export const fallback: EnginesMatch = [
{
pattern: GoogleDomains,
engine: engines.google,
},
{
pattern: StackOverflowDomains,
engine: engines.stackoverflow,
},
{
pattern: SearxDomains,
engine: engines.searx,
},
];

View File

@ -1,22 +0,0 @@
import { Readability } from '@mozilla/readability';
import { HandlerInput } from './handler-input';
import { IHandlerOutput } from './handler.interface';
import { EngineParseError } from '../errors/main';
/**
 * Generic engine: runs Mozilla Readability over the parsed document and
 * returns its content, text, title and language.
 *
 * @throws EngineParseError when Readability yields no parse result.
 */
export default async function readability(
  input: HandlerInput
): Promise<IHandlerOutput> {
  const { document } = input.parseDom().window;
  const article = new Readability(document).parse();

  if (article) {
    const { content, textContent, title, lang } = article;
    return { content, textContent, title, lang };
  }

  throw new EngineParseError('Failed to parse [readability]');
}

View File

@ -1,43 +0,0 @@
import { HandlerInput } from '../handler-input';
import { IHandlerOutput } from '../handler.interface';
import { EngineParseError } from '../../errors/main';
import qPostsHandler from './questions-posts';
/**
 * Engine for Stack Overflow style sites. Dispatches on the URL path:
 * - `/questions/<a>/<b>` (three segments) is delegated to `qPostsHandler`;
 * - `/questions` yields a placeholder content string;
 * - any other `/questions/...` shape is rejected;
 * - every other path yields an empty result.
 *
 * @throws EngineParseError for a `/questions/...` path with an unsupported
 *   number of segments.
 */
export default async function stackoverflow(
  input: HandlerInput
): Promise<IHandlerOutput> {
  const { window } = input.parseDom();

  const segments = new URL(window.location.href).pathname
    .split('/')
    .filter((segment) => segment !== '');

  if (segments[0] !== 'questions') {
    return { content: '', textContent: '', title: '', lang: '' };
  }

  switch (segments.length) {
    case 3:
      return await qPostsHandler(window);
    case 1:
      return { content: 'questions', textContent: '', title: '', lang: '' };
    default:
      throw new EngineParseError('Invalid URL [stackoverflow]');
  }
}

/** Host patterns for which this engine is selected. */
export const StackOverflowDomains = [
  'stackoverflow.com',
  '*.stackoverflow.com',
  '*.stackexchange.com',
  'askubuntu.com',
  'stackapps.com',
  'mathoverflow.net',
  'superuser.com',
  'serverfault.com',
];

View File

@ -1,9 +0,0 @@
/**
 * Renders one post element (question or answer) as a vote-count heading
 * followed by the post body markup.
 *
 * @param el - Post container element, or null when it was not found.
 * @returns The rendered HTML, or an empty string when `el` is missing.
 */
export default function postParser(el: Element | null): string {
  if (el) {
    const bodyHtml = el.querySelector('.js-post-body')?.innerHTML || '';
    const votes = el.querySelector('.js-vote-count')?.textContent || '';

    return `<h3>${votes} votes</h3>${bodyHtml}`;
  }

  return '';
}

View File

@ -1,25 +0,0 @@
import { IHandlerOutput } from '../handler.interface';
import postParser from './post-parser';
/**
 * Renders a question page: the question post, an answer count, then every
 * answer, separated by horizontal rules.
 *
 * @param window - Window whose document holds the question page.
 */
export default async function qPostsHandler(
  window: Window
): Promise<IHandlerOutput> {
  const { document } = window;

  const question = postParser(document.getElementById('question'));
  const title = document.querySelector('.question-hyperlink')?.innerHTML || '';
  const answers = Array.from(document.querySelectorAll('.answer'), (answerEl) =>
    postParser(answerEl)
  );

  const renderedAnswers = answers.join('<hr>');

  return {
    content: `${question}<hr>${answers.length} answers <hr>${renderedAnswers}`,
    textContent: 'question',
    title,
    lang: 'en',
  };
}

View File

@ -1,5 +1,5 @@
export default { export default {
version: '1.5.2', version: '1.5.3',
description: description:
'txtdot is an HTTP proxy that parses only text, links and pictures from pages reducing internet bandwidth usage, removing ads and heavy scripts', 'txtdot is an HTTP proxy that parses only text, links and pictures from pages reducing internet bandwidth usage, removing ads and heavy scripts',
}; };

View File

@ -6,7 +6,7 @@ import {
parseSchema, parseSchema,
} from '../../types/requests/api'; } from '../../types/requests/api';
import handlePage from '../../handlers/main'; import distributor from '../../handlers/main';
import { generateRequestUrl } from '../../utils/generate'; import { generateRequestUrl } from '../../utils/generate';
export default async function parseRoute(fastify: FastifyInstance) { export default async function parseRoute(fastify: FastifyInstance) {
@ -15,7 +15,7 @@ export default async function parseRoute(fastify: FastifyInstance) {
{ schema: parseSchema }, { schema: parseSchema },
async (request: EngineRequest) => { async (request: EngineRequest) => {
return { return {
data: await handlePage( data: await distributor.handlePage(
request.query.url, request.query.url,
generateRequestUrl( generateRequestUrl(
request.protocol, request.protocol,

View File

@ -2,7 +2,7 @@ import { FastifyInstance } from 'fastify';
import { IParseSchema, rawHtmlSchema } from '../../types/requests/api'; import { IParseSchema, rawHtmlSchema } from '../../types/requests/api';
import handlePage from '../../handlers/main'; import distributor from '../../handlers/main';
import { generateRequestUrl } from '../../utils/generate'; import { generateRequestUrl } from '../../utils/generate';
export default async function rawHtml(fastify: FastifyInstance) { export default async function rawHtml(fastify: FastifyInstance) {
@ -12,7 +12,7 @@ export default async function rawHtml(fastify: FastifyInstance) {
async (request, reply) => { async (request, reply) => {
reply.type('text/html; charset=utf-8'); reply.type('text/html; charset=utf-8');
return ( return (
await handlePage( await distributor.handlePage(
request.query.url, request.query.url,
generateRequestUrl( generateRequestUrl(
request.protocol, request.protocol,

View File

@ -1,7 +1,7 @@
import { FastifyInstance } from 'fastify'; import { FastifyInstance } from 'fastify';
import { GetSchema, IGetSchema } from '../../types/requests/browser'; import { GetSchema, IGetSchema } from '../../types/requests/browser';
import handlePage from '../../handlers/main'; import distributor from '../../handlers/main';
import { generateRequestUrl } from '../../utils/generate'; import { generateRequestUrl } from '../../utils/generate';
import getConfig from '../../config/main'; import getConfig from '../../config/main';
@ -14,7 +14,7 @@ export default async function getRoute(fastify: FastifyInstance) {
const remoteUrl = request.query.url; const remoteUrl = request.query.url;
const engine = request.query.engine; const engine = request.query.engine;
const parsed = await handlePage( const parsed = await distributor.handlePage(
remoteUrl, remoteUrl,
generateRequestUrl( generateRequestUrl(
request.protocol, request.protocol,

View File

@ -1,8 +1,9 @@
import { Engine } from '../handlers/engine';
import { HandlerInput } from '../handlers/handler-input'; import { HandlerInput } from '../handlers/handler-input';
import { IHandlerOutput } from '../handlers/handler.interface'; import { IHandlerOutput } from '../handlers/handler.interface';
export interface Engines { export interface Engines {
[key: string]: EngineFunction; [key: string]: Engine;
} }
export type EngineMatch = { export type EngineMatch = {
@ -10,5 +11,12 @@ export type EngineMatch = {
engine: EngineFunction; engine: EngineFunction;
}; };
export type EngineFunction = (input: HandlerInput) => Promise<IHandlerOutput>; export interface RouteValues {
[key: string]: string;
}
export type EngineFunction = (
input: HandlerInput,
req: RouteValues
) => Promise<IHandlerOutput>;
export type EnginesMatch = EngineMatch[]; export type EnginesMatch = EngineMatch[];