Engine routing (#82)

* refactor: move engines to folder

* refactor: engine class

* refactor: add distributor and readability, searx engines class

* delete: google engine

useless since searx added

* fix: stackoverflow

* update version
This commit is contained in:
Artemy Egorov 2024-02-17 15:25:54 +03:00 committed by GitHub
parent 92ab68c587
commit c9f9e48acb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
18 changed files with 256 additions and 297 deletions

22
package-lock.json generated
View File

@ -1,12 +1,12 @@
{
"name": "txtdot",
"version": "1.5.2",
"version": "1.5.3",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "txtdot",
"version": "1.5.2",
"version": "1.5.3",
"license": "MIT",
"dependencies": {
"@fastify/static": "^6.12.0",
@ -23,7 +23,8 @@
"ip-range-check": "^0.2.0",
"json-schema-to-ts": "^3.0.0",
"linkedom": "^0.16.6",
"micromatch": "^4.0.5"
"micromatch": "^4.0.5",
"route-parser": "^0.0.5"
},
"devDependencies": {
"@types/dompurify": "^3.0.5",
@ -31,6 +32,7 @@
"@types/jsdom": "^21.1.6",
"@types/micromatch": "^4.0.6",
"@types/node": "^20.10.6",
"@types/route-parser": "^0.1.7",
"@typescript-eslint/eslint-plugin": "^6.18.0",
"@typescript-eslint/parser": "^6.18.0",
"clean-css-cli": "^5.6.3",
@ -414,6 +416,12 @@
"undici-types": "~5.26.4"
}
},
"node_modules/@types/route-parser": {
"version": "0.1.7",
"resolved": "https://registry.npmjs.org/@types/route-parser/-/route-parser-0.1.7.tgz",
"integrity": "sha512-haO+3HVio/4w+yuMJTjqfSo0ivOV8WnXaOReVD6QN729UGBEyizWNGc2Jd0OLsJDucIod4aJSsPLBeLj2uzMCQ==",
"dev": true
},
"node_modules/@types/semver": {
"version": "7.5.6",
"resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.5.6.tgz",
@ -2964,6 +2972,14 @@
"node": "*"
}
},
"node_modules/route-parser": {
"version": "0.0.5",
"resolved": "https://registry.npmjs.org/route-parser/-/route-parser-0.0.5.tgz",
"integrity": "sha512-nsii+MXoNb7NyF05LP9kaktx6AoBVT/7zUgDnzIb5IoYAvYkbZOAuoLJjVdsyEVxWv0swCxWkKDK4cMva+WDBA==",
"engines": {
"node": ">= 0.9"
}
},
"node_modules/run-parallel": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz",

View File

@ -1,6 +1,6 @@
{
"name": "txtdot",
"version": "1.5.2",
"version": "1.5.3",
"private": true,
"description": "",
"main": "dist/app.js",
@ -19,7 +19,8 @@
"ip-range-check": "^0.2.0",
"json-schema-to-ts": "^3.0.0",
"linkedom": "^0.16.6",
"micromatch": "^4.0.5"
"micromatch": "^4.0.5",
"route-parser": "^0.0.5"
},
"devDependencies": {
"@types/dompurify": "^3.0.5",
@ -27,6 +28,7 @@
"@types/jsdom": "^21.1.6",
"@types/micromatch": "^4.0.6",
"@types/node": "^20.10.6",
"@types/route-parser": "^0.1.7",
"@typescript-eslint/eslint-plugin": "^6.18.0",
"@typescript-eslint/parser": "^6.18.0",
"clean-css-cli": "^5.6.3",

View File

@ -0,0 +1,87 @@
import { IHandlerOutput } from './handler.interface';
import axios from '../types/axios';
import micromatch from 'micromatch';
import DOMPurify from 'dompurify';
import { Readable } from 'stream';
import isLocalResource from '../utils/islocal';
import { LocalResourceError, NotHtmlMimetypeError } from '../errors/main';
import { HandlerInput } from './handler-input';
import { decodeStream, parseEncodingName } from '../utils/http';
import replaceHref from '../utils/replace-href';
import { parseHTML } from 'linkedom';
import { Engine } from './engine';
/**
 * Lookup table keyed by engine name; each value is a numeric position used
 * to index the distributor's engine array.
 */
interface IEngineId {
  [engineName: string]: number;
}
/**
 * Registry of content engines. It fetches a remote page, chooses the engine
 * that should parse it, and post-processes the engine's HTML output.
 */
export class Distributor {
  /** Maps an engine name to its index in `fallback`. */
  engines_id: IEngineId = {};
  /** Registered engines in registration order; index 0 is the default. */
  fallback: Engine[] = [];
  /** Names of the registered engines, in registration order. */
  list: string[] = [];

  constructor() {}

  /**
   * Registers an engine under its `name`.
   *
   * @param engine - engine to append to the registry.
   */
  engine(engine: Engine) {
    this.engines_id[engine.name] = this.list.length;
    this.fallback.push(engine);
    this.list.push(engine.name);
  }

  /**
   * Downloads `remoteUrl`, runs it through the selected engine and returns
   * the sanitized result.
   *
   * @param remoteUrl - remote URL to fetch.
   * @param requestUrl - proxy URL, forwarded to `replaceHref`.
   * @param engineName - optional name of the engine to force.
   * @param redirectPath - forwarded to `replaceHref`; defaults to `'get'`.
   * @returns the engine output with `content` rewritten and sanitized.
   * @throws LocalResourceError when `isLocalResource` reports the URL as local.
   * @throws NotHtmlMimetypeError when a content type is present and is not HTML.
   */
  async handlePage(
    remoteUrl: string, // remote URL
    requestUrl: URL, // proxy URL
    engineName?: string,
    redirectPath: string = 'get'
  ): Promise<IHandlerOutput> {
    const urlObj = new URL(remoteUrl);

    if (await isLocalResource(urlObj)) {
      throw new LocalResourceError();
    }

    const response = await axios.get(remoteUrl);
    const data: Readable = response.data;
    const mime: string | undefined =
      response.headers['content-type']?.toString();

    // Media types are case-insensitive, so compare in lower case.
    if (mime && !mime.toLowerCase().includes('text/html')) {
      // The body is never read on this path; release the stream instead of
      // leaving it open.
      data.destroy();
      throw new NotHtmlMimetypeError();
    }

    const engine = this.getFallbackEngine(urlObj.hostname, engineName);
    const output = await engine.handle(
      new HandlerInput(
        await decodeStream(data, parseEncodingName(mime)),
        remoteUrl
      )
    );

    // post-process
    const dom = parseHTML(output.content);
    replaceHref(dom, requestUrl, new URL(remoteUrl), engineName, redirectPath);

    const purify = DOMPurify(dom.window);
    output.content = purify.sanitize(dom.document.toString());

    return output;
  }

  /**
   * Picks the engine for a request.
   *
   * Order of precedence: the explicitly requested engine (when it is
   * registered), then the first engine whose domain patterns match `host`,
   * then the first registered engine.
   *
   * @param host - hostname of the remote URL.
   * @param specified - optional engine name requested by the caller.
   * @returns the selected engine.
   */
  getFallbackEngine(host: string, specified?: string): Engine {
    // Only trust names that were actually registered. A bare lookup would
    // yield `undefined` for unknown names and inherited members such as
    // `constructor`, which crashed later on `engine.handle`.
    if (
      specified &&
      Object.prototype.hasOwnProperty.call(this.engines_id, specified)
    ) {
      return this.fallback[this.engines_id[specified]];
    }

    for (const engine of this.fallback) {
      if (micromatch.isMatch(host, engine.domains)) {
        return engine;
      }
    }

    return this.fallback[0];
  }
}

38
src/handlers/engine.ts Normal file
View File

@ -0,0 +1,38 @@
import Route from 'route-parser';
import { HandlerInput } from './handler-input';
import { IHandlerOutput } from './handler.interface';
import { EngineParseError } from '../errors/main';
import { EngineFunction } from '../types/handlers';
/** Pairs a compiled route pattern with the function that handles its matches. */
interface IRoute {
  handler: EngineFunction;
  route: Route;
}
/**
 * A named page parser made of path routes, optionally bound to a list of
 * domain patterns.
 */
export class Engine {
  name: string;
  domains: string[];
  routes: IRoute[] = [];

  /**
   * @param name - engine name, also used in error messages.
   * @param domains - domain patterns for this engine; empty by default.
   */
  constructor(name: string, domains: string[] = []) {
    this.domains = domains;
    this.name = name;
  }

  /**
   * Adds a route; routes are tried in the order they were added.
   *
   * @param path - route-parser pattern.
   * @param handler - function invoked with the input and the matched values.
   */
  route(path: string, handler: EngineFunction) {
    const compiled = new Route(path);
    this.routes.push({ route: compiled, handler });
  }

  /**
   * Runs the first route that matches the input URL's path, query and hash.
   *
   * @param input - page to handle.
   * @returns the output of the matching handler.
   * @throws EngineParseError when no route matches.
   */
  async handle(input: HandlerInput): Promise<IHandlerOutput> {
    const { pathname, search, hash } = new URL(input.getUrl());
    const path = pathname + search + hash;

    for (const { route, handler } of this.routes) {
      const params = route.match(path);
      if (params) {
        return await handler(input, params);
      }
    }

    throw new EngineParseError(`No handler for ${path}. [${this.name}]`);
  }
}

View File

@ -0,0 +1,26 @@
import { Readability } from '@mozilla/readability';
import { EngineParseError } from '../../errors/main';
import { Engine } from '../engine';
// Engine backed by Mozilla Readability; created without domain patterns.
const ReadabilityEngine = new Engine('Readability');

// Single route using the `*path` pattern; the matched value is only used in
// the error message below.
ReadabilityEngine.route('*path', async (input, req) => {
  const { document } = input.parseDom().window;
  const article = new Readability(document).parse();

  if (article) {
    const { content, textContent, title, lang } = article;
    return { content, textContent, title, lang };
  }

  throw new EngineParseError(
    `Parse error (${req.path}). [${ReadabilityEngine.name}]`
  );
});
export default ReadabilityEngine;

View File

@ -1,29 +1,24 @@
import { HandlerInput } from './handler-input';
import { IHandlerOutput } from './handler.interface';
import { Engine } from '../engine';
export default async function searx(
input: HandlerInput
): Promise<IHandlerOutput> {
const SearXEngine = new Engine('SearX', ['searx.*']);
SearXEngine.route('/search?q=:search', async (input, req) => {
const document = input.parseDom().window.document;
const search = document.getElementById('q') as HTMLTextAreaElement;
const search = req.search;
const url = new URL(input.getUrl());
const page = parseInt(url.searchParams.get('pageno') || '1');
const page_footer = `${
page !== 1
? `<a href="${url.origin}${url.pathname}?q=${search.value}&pageno=${
? `<a href="${url.origin}${url.pathname}?q=${search}&pageno=${
page - 1
}">Previous </a>|`
: ''
}<a href="${url.origin}${url.pathname}?q=${search.value}&pageno=${
}<a href="${url.origin}${url.pathname}?q=${search}&pageno=${
page + 1
}"> Next</a>`;
const articles = Array.from(document.querySelectorAll('.result'));
const articles_parsed = articles.map((a) => {
const parsed = {
url:
@ -51,9 +46,9 @@ export default async function searx(
return {
content,
textContent,
title: `${search.value} - Searx - Page ${page}`,
title: `${search} - Searx - Page ${page}`,
lang: document.documentElement.lang,
};
}
});
export const SearxDomains = ['searx.*'];
export default SearXEngine;

View File

@ -0,0 +1,45 @@
import { Engine } from '../engine';
// Engine for Stack Overflow and the listed Stack Exchange network hosts.
const SOE = new Engine('StackOverflow', [
  'stackoverflow.com',
  '*.stackoverflow.com',
  '*.stackexchange.com',
  'askubuntu.com',
  'stackapps.com',
  'mathoverflow.net',
  'superuser.com',
  'serverfault.com',
]);

// Question page: renders the question followed by every answer, separated
// by horizontal rules.
SOE.route('/questions/:id/:slug', async (input, req) => {
  const { document } = input.parseDom().window;

  const question = postParser(document.getElementById('question'));
  const title = document.querySelector('.question-hyperlink')?.innerHTML || '';
  const answers = Array.from(document.querySelectorAll('.answer'), (answer) =>
    postParser(answer)
  );

  const content = `${question}<hr>${answers.length} answers <hr>${answers.join('<hr>')}`;

  return {
    content,
    textContent: `${req.id}/${req.slug}\n`,
    title,
    lang: 'en',
  };
});
/**
 * Renders one post (question or answer) as a vote-count heading followed by
 * the post body markup.
 *
 * @param el - post element, or `null` when it was not found.
 * @returns the rendered HTML, or an empty string when `el` is missing.
 */
function postParser(el: Element | null): string {
  if (el) {
    const html = el.querySelector('.js-post-body')?.innerHTML || '';
    const votes = el.querySelector('.js-vote-count')?.textContent || '';
    return `<h3>${votes} votes</h3>${html}`;
  }
  return '';
}
export default SOE;

View File

@ -1,75 +0,0 @@
import { HandlerInput } from './handler-input';
import { IHandlerOutput } from './handler.interface';
import { EngineParseError } from '../errors/main';
/**
 * Engine for Google search result pages: collects the result links and their
 * headings and renders them as a plain list below a small search form.
 *
 * @param input - fetched page; only `parseDom()` is used.
 * @returns HTML `content` and a plain-text `textContent` listing.
 * @throws EngineParseError when the page contains no result anchors.
 */
export default async function google(
  input: HandlerInput
): Promise<IHandlerOutput> {
  // Escapes text for HTML text nodes and double-quoted attribute values.
  const escapeHtml = (text: string): string =>
    text
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;');

  // Resolves the site a result points at: the wrapped `url` query parameter
  // when present, otherwise the link itself. Returns null for hrefs that are
  // not absolute URLs, so one odd anchor no longer aborts the whole page.
  const resolveTarget = (href: string): URL | null => {
    try {
      const link = new URL(href);
      const wrapped = link.searchParams.get('url');
      return wrapped ? new URL(wrapped) : link;
    } catch {
      return null;
    }
  };

  const window = input.parseDom().window;

  const googleAnchors = [
    ...window.document.querySelectorAll('a[jsname=UWckNb]'),
  ] as HTMLAnchorElement[];

  // An array is always truthy, so emptiness has to be checked via length.
  if (googleAnchors.length === 0) {
    throw new EngineParseError(
      'Failed to find anchors in search result [google]'
    );
  }

  // Anchors without a resolvable target or without a heading are skipped.
  const results = googleAnchors.flatMap((a): GoogleProps[] => {
    const target = resolveTarget(a.href);
    const heading = a.childNodes[1]?.textContent;
    return target && heading
      ? [{ href: a.href, siteName: target.hostname, heading }]
      : [];
  });

  const content = results.map(
    (result) =>
      `<p><a href="${escapeHtml(result.href)}">${escapeHtml(
        result.siteName
      )} - ${escapeHtml(result.heading ?? '')}</a></p>`
  );

  const textContent = results.map(
    (result) => `${result.siteName} - ${result.heading} > ${result.href}`
  );

  const search = window.document.getElementById(
    'APjFqb'
  ) as HTMLTextAreaElement | null;

  // Fall back to an empty query instead of printing the literal "undefined".
  const query = escapeHtml(search?.value ?? '');

  const searchForm = `
    <form onsubmit="window.location.href = '/get?url=https://www.google.com/search?q='+ document.getElementById('q').value.split(' ').join('+'); return false">
      <input type="text" name="q" id="q" value="${query}">
      <input type="button" value="Search" onclick="window.location.href = '/get?url=https://www.google.com/search?q='+ document.getElementById('q').value.split(' ').join('+');">
    </form>
  `;

  return {
    content: `${searchForm}${content.join('')}`,
    textContent: textContent.join('\n'),
  };
}
// Hostname glob patterns for the google engine, covering bare and `www.`
// hosts under plain, `co.*` and `com.*` suffixes.
// NOTE(review): presumably matched against the request host with micromatch
// by the handler dispatcher — confirm against the importing module.
export const GoogleDomains = [
'google.*',
'google.co.*',
'google.com.*',
'www.google.*',
'www.google.co.*',
'www.google.com.*',
];
/** One parsed search result. */
interface GoogleProps {
  /** Link of the result anchor. */
  href: string;
  /** Hostname of the site the result points at. */
  siteName: string;
  /** Heading text of the result, or `null` when the anchor has none. */
  heading: string | null;
}

View File

@ -1,97 +1,13 @@
import { IHandlerOutput } from './handler.interface';
import { Engines, EngineFunction, EnginesMatch } from '../types/handlers';
import axios from '../types/axios';
import { Distributor } from './distributor';
import Readability from './engines/readability';
import SearX from './engines/searx';
import StackOverflow from './engines/stackoverflow';
import micromatch from 'micromatch';
const distributor = new Distributor();
import DOMPurify from 'dompurify';
distributor.engine(Readability);
distributor.engine(SearX);
distributor.engine(StackOverflow);
import { Readable } from 'stream';
import readability from './readability';
import google, { GoogleDomains } from './google';
import stackoverflow, { StackOverflowDomains } from './stackoverflow/main';
import searx, { SearxDomains } from './searx';
import isLocalResource from '../utils/islocal';
import { LocalResourceError, NotHtmlMimetypeError } from '../errors/main';
import { HandlerInput } from './handler-input';
import { decodeStream, parseEncodingName } from '../utils/http';
import replaceHref from '../utils/replace-href';
import { parseHTML } from 'linkedom';
export default async function handlePage(
remoteUrl: string, // remote URL
requestUrl: URL, // proxy URL
engine?: string,
redirectPath: string = 'get'
): Promise<IHandlerOutput> {
const urlObj = new URL(remoteUrl);
if (await isLocalResource(urlObj)) {
throw new LocalResourceError();
}
const response = await axios.get(remoteUrl);
const data: Readable = response.data;
const mime: string | undefined = response.headers['content-type']?.toString();
if (mime && mime.indexOf('text/html') === -1) {
throw new NotHtmlMimetypeError();
}
const handler = getFallbackEngine(urlObj.hostname, engine);
const output = await handler(
new HandlerInput(
await decodeStream(data, parseEncodingName(mime)),
remoteUrl
)
);
// post-process
const dom = parseHTML(output.content);
replaceHref(dom, requestUrl, new URL(remoteUrl), engine, redirectPath);
const purify = DOMPurify(dom.window);
output.content = purify.sanitize(dom.document.toString());
return output;
}
function getFallbackEngine(host: string, specified?: string): EngineFunction {
if (specified) {
return engines[specified];
}
for (const engine of fallback) {
if (micromatch.isMatch(host, engine.pattern)) {
return engine.engine;
}
}
return engines.readability;
}
export const engines: Engines = {
readability,
google,
stackoverflow,
searx,
};
export const engineList: string[] = Object.keys(engines);
export const fallback: EnginesMatch = [
{
pattern: GoogleDomains,
engine: engines.google,
},
{
pattern: StackOverflowDomains,
engine: engines.stackoverflow,
},
{
pattern: SearxDomains,
engine: engines.searx,
},
];
export const engineList = distributor.list;
export default distributor;

View File

@ -1,22 +0,0 @@
import { Readability } from '@mozilla/readability';
import { HandlerInput } from './handler-input';
import { IHandlerOutput } from './handler.interface';
import { EngineParseError } from '../errors/main';
/**
 * Extracts the main article of a page with Mozilla Readability.
 *
 * @param input - fetched page; only `parseDom()` is used.
 * @returns the article content, text, title and language.
 * @throws EngineParseError when Readability produces no result.
 */
export default async function readability(
  input: HandlerInput
): Promise<IHandlerOutput> {
  const { document } = input.parseDom().window;
  const article = new Readability(document).parse();

  if (article) {
    const { content, textContent, title, lang } = article;
    return { content, textContent, title, lang };
  }

  throw new EngineParseError('Failed to parse [readability]');
}

View File

@ -1,43 +0,0 @@
import { HandlerInput } from '../handler-input';
import { IHandlerOutput } from '../handler.interface';
import { EngineParseError } from '../../errors/main';
import qPostsHandler from './questions-posts';
/**
 * Dispatches a Stack Overflow page by the shape of its path.
 *
 * Paths that do not start with `questions` yield an empty result. Under
 * `questions`, three segments go to the question/answers handler, one
 * segment yields a placeholder, and anything else is rejected.
 *
 * @param input - fetched page; only `parseDom()` is used.
 * @returns the parsed page.
 * @throws EngineParseError for an unsupported `questions` path.
 */
export default async function stackoverflow(
  input: HandlerInput
): Promise<IHandlerOutput> {
  const window = input.parseDom().window;
  const { pathname } = new URL(window.location.href);
  const segments = pathname.split('/').filter((part) => part !== '');

  const blank: IHandlerOutput = {
    content: '',
    textContent: '',
    title: '',
    lang: '',
  };

  if (segments[0] !== 'questions') {
    return blank;
  }

  switch (segments.length) {
    case 3:
      return await qPostsHandler(window);
    case 1:
      return { ...blank, content: 'questions' };
    default:
      throw new EngineParseError('Invalid URL [stackoverflow]');
  }
}
// Hostname patterns for the stackoverflow engine: Stack Overflow itself,
// its subdomains, Stack Exchange subdomains and several standalone network
// sites.
// NOTE(review): presumably matched against the request host with micromatch
// by the handler dispatcher — confirm against the importing module.
export const StackOverflowDomains = [
'stackoverflow.com',
'*.stackoverflow.com',
'*.stackexchange.com',
'askubuntu.com',
'stackapps.com',
'mathoverflow.net',
'superuser.com',
'serverfault.com',
];

View File

@ -1,9 +0,0 @@
/**
 * Builds the HTML for a single post: an `<h3>` with the vote count followed
 * by the post body.
 *
 * @param el - post element, or `null` when absent.
 * @returns rendered HTML; an empty string when there is no element.
 */
export default function postParser(el: Element | null): string {
  if (el) {
    const postBody = el.querySelector('.js-post-body');
    const voteCounter = el.querySelector('.js-vote-count');

    const body = postBody?.innerHTML || '';
    const voteCount = voteCounter?.textContent || '';

    return `<h3>${voteCount} votes</h3>${body}`;
  }
  return '';
}

View File

@ -1,25 +0,0 @@
import { IHandlerOutput } from '../handler.interface';
import postParser from './post-parser';
/**
 * Renders a question page: the question, an answer count and every answer,
 * separated by horizontal rules.
 *
 * @param window - window of the parsed question page.
 * @returns the rendered page; `textContent` is the fixed string `'question'`.
 */
export default async function qPostsHandler(
  window: Window
): Promise<IHandlerOutput> {
  const { document } = window;

  const question = postParser(document.getElementById('question'));
  const title = document.querySelector('.question-hyperlink')?.innerHTML || '';
  const answers = Array.from(document.querySelectorAll('.answer'), (node) =>
    postParser(node)
  );

  const content = `${question}<hr>${answers.length} answers <hr>${answers.join('<hr>')}`;

  return {
    content,
    textContent: 'question',
    title,
    lang: 'en',
  };
}

View File

@ -1,5 +1,5 @@
export default {
version: '1.5.2',
version: '1.5.3',
description:
'txtdot is an HTTP proxy that parses only text, links and pictures from pages reducing internet bandwidth usage, removing ads and heavy scripts',
};

View File

@ -6,7 +6,7 @@ import {
parseSchema,
} from '../../types/requests/api';
import handlePage from '../../handlers/main';
import distributor from '../../handlers/main';
import { generateRequestUrl } from '../../utils/generate';
export default async function parseRoute(fastify: FastifyInstance) {
@ -15,7 +15,7 @@ export default async function parseRoute(fastify: FastifyInstance) {
{ schema: parseSchema },
async (request: EngineRequest) => {
return {
data: await handlePage(
data: await distributor.handlePage(
request.query.url,
generateRequestUrl(
request.protocol,

View File

@ -2,7 +2,7 @@ import { FastifyInstance } from 'fastify';
import { IParseSchema, rawHtmlSchema } from '../../types/requests/api';
import handlePage from '../../handlers/main';
import distributor from '../../handlers/main';
import { generateRequestUrl } from '../../utils/generate';
export default async function rawHtml(fastify: FastifyInstance) {
@ -12,7 +12,7 @@ export default async function rawHtml(fastify: FastifyInstance) {
async (request, reply) => {
reply.type('text/html; charset=utf-8');
return (
await handlePage(
await distributor.handlePage(
request.query.url,
generateRequestUrl(
request.protocol,

View File

@ -1,7 +1,7 @@
import { FastifyInstance } from 'fastify';
import { GetSchema, IGetSchema } from '../../types/requests/browser';
import handlePage from '../../handlers/main';
import distributor from '../../handlers/main';
import { generateRequestUrl } from '../../utils/generate';
import getConfig from '../../config/main';
@ -14,7 +14,7 @@ export default async function getRoute(fastify: FastifyInstance) {
const remoteUrl = request.query.url;
const engine = request.query.engine;
const parsed = await handlePage(
const parsed = await distributor.handlePage(
remoteUrl,
generateRequestUrl(
request.protocol,

View File

@ -1,8 +1,9 @@
import { Engine } from '../handlers/engine';
import { HandlerInput } from '../handlers/handler-input';
import { IHandlerOutput } from '../handlers/handler.interface';
export interface Engines {
[key: string]: EngineFunction;
[key: string]: Engine;
}
export type EngineMatch = {
@ -10,5 +11,12 @@ export type EngineMatch = {
engine: EngineFunction;
};
export type EngineFunction = (input: HandlerInput) => Promise<IHandlerOutput>;
export interface RouteValues {
[key: string]: string;
}
export type EngineFunction = (
input: HandlerInput,
req: RouteValues
) => Promise<IHandlerOutput>;
export type EnginesMatch = EngineMatch[];