// Source path: txtdot/packages/server/src/distributor.ts

import axios, { oaxios } from './types/axios';
import micromatch from 'micromatch';
import DOMPurify from 'dompurify';
import { Readable } from 'stream';
import { NotHtmlMimetypeError } from './errors/main';
import { decodeStream, parseEncodingName } from './utils/http';
import replaceHref from './utils/replace-href';
import { Engine } from '@txtdot/sdk';
2024-05-13 13:30:47 +03:00
import { HandlerInput, HandlerOutput } from '@txtdot/sdk';
import config from './config';
2024-05-13 16:35:36 +03:00
import { parseHTML } from 'linkedom';
import { html2text } from './utils/html2text';
/**
 * Lookup table from an engine's name to a numeric index.
 * `Distributor.engine` stores the engine's position in its `fallback` array here.
 */
interface IEngineId {
  [engineName: string]: number;
}
/**
 * Keeps the registered engines and runs a remote page through one of them.
 *
 * Engines are stored in registration order; the first registered engine is
 * the one returned when no other selection rule matches.
 */
export class Distributor {
  /** Maps an engine name to its index in `fallback`. */
  engines_id: IEngineId = {};
  /** Registered engines, in registration order. */
  fallback: Engine[] = [];
  /** Names of the registered engines, in registration order. */
  list: string[] = [];

  /**
   * Registers an engine and records its name and position.
   *
   * @param engine - Engine to append to the registry.
   */
  engine(engine: Engine) {
    this.engines_id[engine.name] = this.list.length;
    this.fallback.push(engine);
    this.list.push(engine.name);
  }

  /**
   * Fetches `remoteUrl`, processes it with the selected engine, post-processes
   * links, sanitizes the resulting HTML and builds the text representation.
   *
   * When `config.env.third_party.webder_url` is set, the page is requested
   * from `${webder_url}/render?url=...` via `oaxios`; otherwise it is fetched
   * directly with `axios`.
   *
   * @param remoteUrl - Remote URL to fetch.
   * @param requestUrl - Proxy URL, forwarded to `replaceHref`.
   * @param engineName - Optional engine name; see `getFallbackEngine`.
   * @param redirectPath - Forwarded to `replaceHref`; defaults to `'get'`.
   * @returns Sanitized HTML content, text content, title and language.
   * @throws NotHtmlMimetypeError when a `content-type` header is present and
   *   does not contain `text/html`.
   */
  async handlePage(
    remoteUrl: string, // remote URL
    requestUrl: URL, // proxy URL
    engineName?: string,
    redirectPath: string = 'get'
  ): Promise<HandlerOutput> {
    const urlObj = new URL(remoteUrl);

    const webder_url = config.env.third_party.webder_url;

    const response = webder_url
      ? await oaxios.get(
          `${webder_url}/render?url=${encodeURIComponent(remoteUrl)}`
        )
      : await axios.get(remoteUrl);

    const data: Readable = response.data;
    const mime: string | undefined =
      response.headers['content-type']?.toString();

    // Media types are case-insensitive, so compare against a lower-cased copy.
    // The original header value is still what gets passed to parseEncodingName.
    if (mime && !mime.toLowerCase().includes('text/html')) {
      throw new NotHtmlMimetypeError();
    }

    const engine = this.getFallbackEngine(urlObj.hostname, engineName);

    const output = await engine.handle(
      new HandlerInput(
        await decodeStream(data, parseEncodingName(mime)),
        remoteUrl
      )
    );

    const dom = parseHTML(output.content);

    // Get text content before link replacement, because in text format we need original links
    const stdTextContent = dom.document.documentElement.textContent;

    // post-process
    // TODO: generate dom in handler and not parse here twice
    replaceHref(dom.document, requestUrl, urlObj, engineName, redirectPath);

    const purify = DOMPurify(dom);
    const content = purify.sanitize(dom.document.toString());

    // `||` is deliberate: an empty title/lang from the engine falls back to the document's.
    const title = output.title || dom.document.title;
    const lang = output.lang || dom.document.documentElement.lang;
    const textContent =
      html2text(stdTextContent, output, title) ||
      'Text output cannot be generated.';

    return {
      content,
      textContent,
      title,
      lang,
    };
  }

  /**
   * Selects the engine to use for a host.
   *
   * Selection order:
   * 1. the engine registered under `specified`, if that name is registered;
   * 2. the first registered engine whose `domains` match `host`;
   * 3. the first registered engine.
   *
   * An unregistered `specified` name (including names that only exist on
   * `Object.prototype`, such as `constructor`) is ignored and selection
   * continues with step 2, instead of returning `undefined`.
   *
   * @param host - Hostname of the remote page.
   * @param specified - Optional engine name requested by the caller.
   * @returns The selected engine. NOTE(review): still `undefined` at runtime
   *   if no engine has been registered at all.
   */
  getFallbackEngine(host: string, specified?: string): Engine {
    if (
      specified &&
      Object.prototype.hasOwnProperty.call(this.engines_id, specified)
    ) {
      const requested = this.fallback[this.engines_id[specified]];
      if (requested) {
        return requested;
      }
    }

    for (const engine of this.fallback) {
      if (micromatch.isMatch(host, engine.domains)) {
        return engine;
      }
    }

    return this.fallback[0];
  }
}