refactor: engine output

This commit is contained in:
Artemy 2024-05-13 13:30:47 +03:00
parent 494d1c8134
commit bdf625bb1f
13 changed files with 97 additions and 66 deletions

View File

@ -1,6 +1,6 @@
{
"name": "@txtdot/plugins",
"version": "1.1.1",
"version": "2.0.0",
"description": "Official txtdot plugins",
"main": "dist/lib.js",
"types": "dist/lib.d.ts",
@ -19,6 +19,7 @@
"license": "MIT",
"dependencies": {
"@mozilla/readability": "^0.5.0",
"linkedom": "^0.18.0",
"@txtdot/sdk": "workspace:*"
},
"devDependencies": {

View File

@ -1,6 +1,7 @@
import { Readability as OReadability } from '@mozilla/readability';
import { Engine, EngineParseError } from '@txtdot/sdk';
import { parseHTML } from 'linkedom';
const Readability = new Engine(
'Readability',
@ -9,7 +10,7 @@ const Readability = new Engine(
);
Readability.route('*path', async (input, ro) => {
const reader = new OReadability(input.parseDom().window.document);
const reader = new OReadability(input.document);
const parsed = reader.parse();
if (!parsed) {
@ -17,7 +18,7 @@ Readability.route('*path', async (input, ro) => {
}
return {
content: parsed.content,
document: parseHTML(parsed.content).document,
textContent: parsed.textContent,
title: parsed.title,
lang: parsed.lang,

View File

@ -1,5 +1,6 @@
import { Engine } from '@txtdot/sdk';
import { HandlerInput, Route } from '@txtdot/sdk';
import { parseHTML } from 'linkedom';
const SearX = new Engine('SearX', "Engine for searching with 'SearXNG'", [
'searx.*',
@ -9,7 +10,7 @@ async function search(
input: HandlerInput,
ro: Route<{ search: string; pageno?: string }>
) {
const document = input.parseDom().window.document;
const document = input.document;
const search = ro.q.search;
const page = parseInt(ro.q.pageno || '1');
@ -45,7 +46,7 @@ async function search(
const textContent = articles_parsed.map((a) => a.text).join('');
return {
content,
document: parseHTML(content).document,
textContent,
title: `${search} - Searx - Page ${page}`,
lang: document.documentElement.lang,

View File

@ -1,10 +1,11 @@
import { HandlerInput, Route } from '@txtdot/sdk';
import { parseHTML } from 'linkedom';
async function questions(
input: HandlerInput,
ro: Route<{ id: string; slug: string }>
) {
const document = input.parseDom().window.document;
const document = input.document;
const questionEl = document.getElementById('question');
const question = postParser(questionEl);
@ -15,9 +16,9 @@ async function questions(
const answers = allAnswers.map((a) => postParser(a));
return {
content: `${question}<hr>${answers.length} answers <hr>${answers.join(
'<hr>'
)}`,
document: parseHTML(
`${question}<hr>${answers.length} answers <hr>${answers.join('<hr>')}`
).document,
textContent: `${ro.q.id}/${ro.q.slug}\nText output not supported`, // TODO
title,
lang: document.documentElement.lang,

View File

@ -1,10 +1,11 @@
import { HandlerInput, Route } from '@txtdot/sdk';
import { parseHTML } from 'linkedom';
async function users(
input: HandlerInput,
ro: Route<{ id: string; slug: string }>
) {
const document = input.parseDom().window.document;
const document = input.document;
const userInfo =
document.querySelector('.md\\:ai-start > div:nth-child(2)')?.textContent ||
@ -26,7 +27,8 @@ async function users(
.join('<br/>');
return {
content: `${userInfo}<hr><h3>Top Posts</h3>${topPosts}`,
document: parseHTML(`${userInfo}<hr><h3>Top Posts</h3>${topPosts}`)
.document,
textContent: `${ro.q.id}/${ro.q.slug}\n`, // TODO
title: document.querySelector('title')?.textContent || '',
lang: document.documentElement.lang,

View File

@ -1,6 +1,6 @@
{
"name": "@txtdot/sdk",
"version": "1.1.2",
"version": "2.0.0",
"description": "SDK for creating plugins for TxtDot",
"main": "dist/lib.js",
"types": "dist/lib.d.ts",

View File

@ -2,9 +2,9 @@ import Route from 'route-parser';
import {
HandlerInput,
IHandlerOutput,
EngineFunction,
RouteValues,
EngineOutput,
} from './types/handler';
import { NoHandlerFoundError } from './types/errors';
@ -33,7 +33,7 @@ export class Engine {
this.routes.push({ route: new Route<TParams>(path), handler });
}
async handle(input: HandlerInput): Promise<IHandlerOutput> {
async handle(input: HandlerInput): Promise<EngineOutput> {
const url = new URL(input.getUrl());
const path = url.pathname + url.search + url.hash;
for (const route of this.routes) {

View File

@ -13,7 +13,7 @@ import {
RouteValues,
EnginesMatch,
HandlerInput,
IHandlerOutput,
HandlerOutput,
Route,
handlerSchema,
} from './types/handler';
@ -29,7 +29,7 @@ export {
RouteValues,
EnginesMatch,
HandlerInput,
IHandlerOutput,
HandlerOutput,
Route,
handlerSchema,
};

View File

@ -4,7 +4,7 @@ import { Engine } from '../engine';
export class HandlerInput {
private data: string;
private url: string;
private dom?: Window;
private window?: Window;
constructor(data: string, url: string) {
this.data = data;
@ -15,23 +15,30 @@ export class HandlerInput {
return this.url;
}
parseDom(): Window {
if (this.dom) {
return this.dom;
get document(): Document {
if (this.window) {
return this.window.document;
}
this.dom = parseHTML(this.data);
return this.dom;
this.window = parseHTML(this.data);
return this.window.document;
}
}
export interface IHandlerOutput {
export interface HandlerOutput {
content: string;
textContent: string;
title?: string;
lang?: string;
}
/**
 * Result produced by an engine: the resolved value of an `EngineFunction`
 * route handler and of `Engine.handle`.
 *
 * Unlike `HandlerOutput`, which carries the final HTML as a `content` string,
 * this shape carries the page as a DOM `Document`, so the consumer can
 * post-process it without parsing the HTML a second time.
 */
export interface EngineOutput {
/**
 * DOM document holding the engine's extracted content. The bundled engines
 * build it with linkedom's `parseHTML(...).document`.
 */
document: Document;
/**
 * Optional plain-text rendering of the content. When omitted, the server's
 * distributor falls back to `document.textContent`, then to an empty string.
 */
textContent?: string;
/** Optional page title; the server's distributor copies it through as-is. */
title?: string;
/**
 * Optional language value; the bundled engines fill it from the parsed
 * article or from `document.documentElement.lang`.
 */
lang?: string;
}
export const handlerSchema = {
type: 'object',
properties: {
@ -66,7 +73,7 @@ export interface RouteValues {
export type EngineFunction<TParams extends RouteValues> = (
input: HandlerInput,
ro: Route<TParams>
) => Promise<IHandlerOutput>;
) => Promise<EngineOutput>;
export type EnginesMatch<TParams extends RouteValues> = EngineMatch<TParams>[];

View File

@ -1,6 +1,6 @@
{
"name": "@txtdot/server",
"version": "1.7.0",
"version": "1.8.0",
"private": true,
"description": "txtdot is an HTTP proxy that parses only text, links and pictures from pages reducing internet bandwidth usage, removing ads and heavy scripts",
"main": "dist/app.js",

View File

@ -5,10 +5,9 @@ import { Readable } from 'stream';
import { NotHtmlMimetypeError } from './errors/main';
import { decodeStream, parseEncodingName } from './utils/http';
import replaceHref from './utils/replace-href';
import { parseHTML } from 'linkedom';
import { Engine } from '@txtdot/sdk';
import { HandlerInput, IHandlerOutput } from '@txtdot/sdk';
import { HandlerInput, HandlerOutput } from '@txtdot/sdk';
import config from './config';
interface IEngineId {
@ -32,7 +31,7 @@ export class Distributor {
requestUrl: URL, // proxy URL
engineName?: string,
redirectPath: string = 'get'
): Promise<IHandlerOutput> {
): Promise<HandlerOutput> {
const urlObj = new URL(remoteUrl);
const webder_url = config.env.third_party.webder_url;
@ -61,13 +60,23 @@ export class Distributor {
// post-process
// TODO: generate dom in handler and not parse here twice
const dom = parseHTML(output.content);
replaceHref(dom, requestUrl, new URL(remoteUrl), engineName, redirectPath);
replaceHref(
output.document,
requestUrl,
new URL(remoteUrl),
engineName,
redirectPath
);
const purify = DOMPurify(dom.window);
output.content = purify.sanitize(dom.document.toString());
const purify = DOMPurify();
const content = purify.sanitize(output.document.toString());
return output;
return {
content,
textContent: output.textContent || output.document.textContent || '',
title: output.title,
lang: output.lang,
};
}
getFallbackEngine(host: string, specified?: string): Engine {

View File

@ -2,13 +2,12 @@ import config from '../config';
import { generateParserUrl, generateProxyUrl } from './generate';
export default function replaceHref(
dom: Window,
doc: Document,
requestUrl: URL,
remoteUrl: URL,
engine?: string,
redirectPath: string = 'get'
) {
const doc: Document = dom.window.document;
const parserUrl = (href: string) =>
generateParserUrl(requestUrl, remoteUrl, href, engine, redirectPath);

View File

@ -13,7 +13,7 @@ importers:
version: 20.12.11
'@typescript-eslint/eslint-plugin':
specifier: ^7.7.0
version: 7.8.0(@typescript-eslint/parser@7.8.0)(eslint@8.57.0)(typescript@5.4.5)
version: 7.8.0(@typescript-eslint/parser@7.8.0(eslint@8.57.0)(typescript@5.4.5))(eslint@8.57.0)(typescript@5.4.5)
'@typescript-eslint/parser':
specifier: ^7.7.0
version: 7.8.0(eslint@8.57.0)(typescript@5.4.5)
@ -22,7 +22,7 @@ importers:
version: 8.57.0
lerna:
specifier: ^8.1.2
version: 8.1.2
version: 8.1.2(encoding@0.1.13)
prettier:
specifier: ^3.1.1
version: 3.2.5
@ -41,6 +41,9 @@ importers:
'@txtdot/sdk':
specifier: workspace:*
version: link:../sdk
linkedom:
specifier: ^0.18.0
version: 0.18.0
devDependencies:
typescript:
specifier: ^5.4.5
@ -3378,12 +3381,12 @@ snapshots:
dependencies:
'@sinclair/typebox': 0.27.8
'@lerna/create@8.1.2(typescript@5.4.5)':
'@lerna/create@8.1.2(encoding@0.1.13)(typescript@5.4.5)':
dependencies:
'@npmcli/run-script': 7.0.2
'@nx/devkit': 18.3.4(nx@18.3.4)
'@octokit/plugin-enterprise-rest': 6.0.1
'@octokit/rest': 19.0.11
'@octokit/rest': 19.0.11(encoding@0.1.13)
byte-size: 8.1.1
chalk: 4.1.0
clone-deep: 4.0.1
@ -3413,7 +3416,7 @@ snapshots:
make-dir: 4.0.0
minimatch: 3.0.5
multimatch: 5.0.0
node-fetch: 2.6.7
node-fetch: 2.6.7(encoding@0.1.13)
npm-package-arg: 8.1.1
npm-packlist: 5.1.1
npm-registry-fetch: 14.0.5
@ -3578,11 +3581,11 @@ snapshots:
'@octokit/auth-token@3.0.4': {}
'@octokit/core@4.2.4':
'@octokit/core@4.2.4(encoding@0.1.13)':
dependencies:
'@octokit/auth-token': 3.0.4
'@octokit/graphql': 5.0.6
'@octokit/request': 6.2.8
'@octokit/graphql': 5.0.6(encoding@0.1.13)
'@octokit/request': 6.2.8(encoding@0.1.13)
'@octokit/request-error': 3.0.3
'@octokit/types': 9.3.2
before-after-hook: 2.2.3
@ -3596,9 +3599,9 @@ snapshots:
is-plain-object: 5.0.0
universal-user-agent: 6.0.1
'@octokit/graphql@5.0.6':
'@octokit/graphql@5.0.6(encoding@0.1.13)':
dependencies:
'@octokit/request': 6.2.8
'@octokit/request': 6.2.8(encoding@0.1.13)
'@octokit/types': 9.3.2
universal-user-agent: 6.0.1
transitivePeerDependencies:
@ -3608,19 +3611,19 @@ snapshots:
'@octokit/plugin-enterprise-rest@6.0.1': {}
'@octokit/plugin-paginate-rest@6.1.2(@octokit/core@4.2.4)':
'@octokit/plugin-paginate-rest@6.1.2(@octokit/core@4.2.4(encoding@0.1.13))':
dependencies:
'@octokit/core': 4.2.4
'@octokit/core': 4.2.4(encoding@0.1.13)
'@octokit/tsconfig': 1.0.2
'@octokit/types': 9.3.2
'@octokit/plugin-request-log@1.0.4(@octokit/core@4.2.4)':
'@octokit/plugin-request-log@1.0.4(@octokit/core@4.2.4(encoding@0.1.13))':
dependencies:
'@octokit/core': 4.2.4
'@octokit/core': 4.2.4(encoding@0.1.13)
'@octokit/plugin-rest-endpoint-methods@7.2.3(@octokit/core@4.2.4)':
'@octokit/plugin-rest-endpoint-methods@7.2.3(@octokit/core@4.2.4(encoding@0.1.13))':
dependencies:
'@octokit/core': 4.2.4
'@octokit/core': 4.2.4(encoding@0.1.13)
'@octokit/types': 10.0.0
'@octokit/request-error@3.0.3':
@ -3629,23 +3632,23 @@ snapshots:
deprecation: 2.3.1
once: 1.4.0
'@octokit/request@6.2.8':
'@octokit/request@6.2.8(encoding@0.1.13)':
dependencies:
'@octokit/endpoint': 7.0.6
'@octokit/request-error': 3.0.3
'@octokit/types': 9.3.2
is-plain-object: 5.0.0
node-fetch: 2.6.7
node-fetch: 2.6.7(encoding@0.1.13)
universal-user-agent: 6.0.1
transitivePeerDependencies:
- encoding
'@octokit/rest@19.0.11':
'@octokit/rest@19.0.11(encoding@0.1.13)':
dependencies:
'@octokit/core': 4.2.4
'@octokit/plugin-paginate-rest': 6.1.2(@octokit/core@4.2.4)
'@octokit/plugin-request-log': 1.0.4(@octokit/core@4.2.4)
'@octokit/plugin-rest-endpoint-methods': 7.2.3(@octokit/core@4.2.4)
'@octokit/core': 4.2.4(encoding@0.1.13)
'@octokit/plugin-paginate-rest': 6.1.2(@octokit/core@4.2.4(encoding@0.1.13))
'@octokit/plugin-request-log': 1.0.4(@octokit/core@4.2.4(encoding@0.1.13))
'@octokit/plugin-rest-endpoint-methods': 7.2.3(@octokit/core@4.2.4(encoding@0.1.13))
transitivePeerDependencies:
- encoding
@ -3771,7 +3774,7 @@ snapshots:
'@types/trusted-types@2.0.7': {}
'@typescript-eslint/eslint-plugin@7.8.0(@typescript-eslint/parser@7.8.0)(eslint@8.57.0)(typescript@5.4.5)':
'@typescript-eslint/eslint-plugin@7.8.0(@typescript-eslint/parser@7.8.0(eslint@8.57.0)(typescript@5.4.5))(eslint@8.57.0)(typescript@5.4.5)':
dependencies:
'@eslint-community/regexpp': 4.10.0
'@typescript-eslint/parser': 7.8.0(eslint@8.57.0)(typescript@5.4.5)
@ -3786,6 +3789,7 @@ snapshots:
natural-compare: 1.4.0
semver: 7.6.2
ts-api-utils: 1.3.0(typescript@5.4.5)
optionalDependencies:
typescript: 5.4.5
transitivePeerDependencies:
- supports-color
@ -3798,6 +3802,7 @@ snapshots:
'@typescript-eslint/visitor-keys': 7.8.0
debug: 4.3.4
eslint: 8.57.0
optionalDependencies:
typescript: 5.4.5
transitivePeerDependencies:
- supports-color
@ -3814,6 +3819,7 @@ snapshots:
debug: 4.3.4
eslint: 8.57.0
ts-api-utils: 1.3.0(typescript@5.4.5)
optionalDependencies:
typescript: 5.4.5
transitivePeerDependencies:
- supports-color
@ -3830,6 +3836,7 @@ snapshots:
minimatch: 9.0.4
semver: 7.6.2
ts-api-utils: 1.3.0(typescript@5.4.5)
optionalDependencies:
typescript: 5.4.5
transitivePeerDependencies:
- supports-color
@ -3909,11 +3916,11 @@ snapshots:
indent-string: 4.0.0
ajv-formats@2.1.1(ajv@8.13.0):
dependencies:
optionalDependencies:
ajv: 8.13.0
ajv-formats@3.0.1(ajv@8.13.0):
dependencies:
optionalDependencies:
ajv: 8.13.0
ajv@6.12.6:
@ -4300,6 +4307,7 @@ snapshots:
js-yaml: 4.1.0
parse-json: 5.2.0
path-type: 4.0.0
optionalDependencies:
typescript: 5.4.5
cross-spawn@7.0.3:
@ -5138,13 +5146,13 @@ snapshots:
kind-of@6.0.3: {}
lerna@8.1.2:
lerna@8.1.2(encoding@0.1.13):
dependencies:
'@lerna/create': 8.1.2(typescript@5.4.5)
'@lerna/create': 8.1.2(encoding@0.1.13)(typescript@5.4.5)
'@npmcli/run-script': 7.0.2
'@nx/devkit': 18.3.4(nx@18.3.4)
'@octokit/plugin-enterprise-rest': 6.0.1
'@octokit/rest': 19.0.11
'@octokit/rest': 19.0.11(encoding@0.1.13)
byte-size: 8.1.1
chalk: 4.1.0
clone-deep: 4.0.1
@ -5180,7 +5188,7 @@ snapshots:
make-dir: 4.0.0
minimatch: 3.0.5
multimatch: 5.0.0
node-fetch: 2.6.7
node-fetch: 2.6.7(encoding@0.1.13)
npm-package-arg: 8.1.1
npm-packlist: 5.1.1
npm-registry-fetch: 14.0.5
@ -5506,9 +5514,11 @@ snapshots:
node-cleanup@2.1.2: {}
node-fetch@2.6.7:
node-fetch@2.6.7(encoding@0.1.13):
dependencies:
whatwg-url: 5.0.0
optionalDependencies:
encoding: 0.1.13
node-gyp@10.1.0:
dependencies: