feat: auto html-to-text conversion

fixes: #100
This commit is contained in:
Artemy 2024-05-14 12:56:51 +03:00
parent 04322ea3c5
commit 9a6819e652
7 changed files with 90 additions and 2 deletions

View File

@ -19,7 +19,6 @@ Readability.route('*path', async (input, ro) => {
return {
content: parsed.content,
textContent: parsed.textContent,
title: parsed.title,
lang: parsed.lang,
};

View File

@ -30,6 +30,7 @@
"dotenv": "^16.3.1",
"ejs": "^3.1.10",
"fastify": "^4.26.2",
"html-to-text": "^9.0.5",
"iconv-lite": "^0.6.3",
"ip-range-check": "^0.2.0",
"json-schema-to-ts": "^3.0.1",
@ -40,6 +41,7 @@
"devDependencies": {
"@types/dompurify": "^3.0.5",
"@types/ejs": "^3.1.5",
"@types/html-to-text": "^9.0.4",
"@types/jsdom": "^21.1.6",
"@types/micromatch": "^4.0.7",
"clean-css-cli": "^5.6.3",

View File

@ -1,5 +1,6 @@
import { IAppConfig } from '../types/appConfig';
import { engineList } from '@txtdot/plugins';
import { compile } from 'html-to-text';
/**
* Configuration of plugins
@ -7,6 +8,7 @@ import { engineList } from '@txtdot/plugins';
*/
const plugin_config: IAppConfig = {
  // Fresh array holding every engine exported by `@txtdot/plugins`.
  engines: [...engineList],

  // Converter obtained from html-to-text's `compile`, called without options.
  html2text: compile(),
};

export default plugin_config;

View File

@ -10,6 +10,7 @@ import { Engine } from '@txtdot/sdk';
import { HandlerInput, HandlerOutput } from '@txtdot/sdk';
import config from './config';
import { parseHTML } from 'linkedom';
import { html2text } from './utils/html2text';
interface IEngineId {
[key: string]: number;
@ -78,7 +79,7 @@ export class Distributor {
return {
content,
textContent:
output.textContent || dom.document.documentElement.textContent || '',
html2text(output, dom.document) || 'Text output cannot be generated.',
title: output.title || dom.document.title,
lang: output.lang || dom.document.documentElement.lang,
};

View File

@ -1,5 +1,8 @@
import { Engine } from '@txtdot/sdk';
/** Function that takes an HTML string and returns a string. */
type Html2TextConverter = (html: string) => string;

/** Shape of the plugin configuration object. */
export interface IAppConfig {
  /** Engines made available through this configuration. */
  engines: Engine[];

  /** Optional HTML-to-text converter; may be left unset. */
  html2text?: Html2TextConverter;
}

View File

@ -0,0 +1,9 @@
import { EngineOutput } from '@txtdot/sdk/dist/types/handler';
import config from '../config';
/**
 * Selects the plain-text representation for an engine result.
 *
 * Order of preference:
 * 1. `output.textContent`, when it is truthy;
 * 2. the result of `config.plugin.html2text` applied to `output.content`,
 *    when a converter is configured;
 * 3. the `textContent` of the document's root element.
 *
 * @param output - Engine result providing `textContent` and `content`.
 * @param doc - Document whose root element text is the last-resort value.
 * @returns The first applicable value from the list above. The last fallback
 *   is returned as-is, so the caller receives whatever that property holds.
 */
export function html2text(output: EngineOutput, doc: Document) {
  // Text supplied by the engine always wins.
  if (output.textContent) {
    return output.textContent;
  }

  // Call through `config.plugin` so the converter keeps its original receiver.
  return config.plugin.html2text
    ? config.plugin.html2text(output.content)
    : doc.documentElement.textContent;
}

View File

@ -103,6 +103,9 @@ importers:
fastify:
specifier: ^4.26.2
version: 4.27.0
html-to-text:
specifier: ^9.0.5
version: 9.0.5
iconv-lite:
specifier: ^0.6.3
version: 0.6.3
@ -128,6 +131,9 @@ importers:
'@types/ejs':
specifier: ^3.1.5
version: 3.1.5
'@types/html-to-text':
specifier: ^9.0.4
version: 9.0.4
'@types/jsdom':
specifier: ^21.1.6
version: 21.1.6
@ -543,6 +549,9 @@ packages:
resolution: {integrity: sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==}
engines: {node: '>=14'}
'@selderee/plugin-htmlparser2@0.11.0':
resolution: {integrity: sha512-P33hHGdldxGabLFjPPpaTxVolMrzrcegejx+0GxjrIb9Zv48D8yAIA/QTDR2dFl7Uz7urX8aX6+5bCZslr+gWQ==}
'@sigstore/bundle@1.1.0':
resolution: {integrity: sha512-PFutXEy0SmQxYI4texPw3dd2KewuNqv7OuK1ZFtY2fM754yhvG2KdgwIhRnoEE2uHdtdGNQ8s0lb94dW9sELog==}
engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0}
@ -615,6 +624,9 @@ packages:
'@types/ejs@3.1.5':
resolution: {integrity: sha512-nv+GSx77ZtXiJzwKdsASqi+YQ5Z7vwHsTP0JY2SiQgjGckkBRKZnk8nIM+7oUZ1VCtuTz0+By4qVR7fqzp/Dfg==}
'@types/html-to-text@9.0.4':
resolution: {integrity: sha512-pUY3cKH/Nm2yYrEmDlPR1mR7yszjGx4DrwPjQ702C4/D5CwHuZTgZdIdwPkRbcuhs7BAh2L5rg3CL5cbRiGTCQ==}
'@types/jsdom@21.1.6':
resolution: {integrity: sha512-/7kkMsC+/kMs7gAYmmBR9P0vGTnOoLhQhyhQJSlXGI5bzTHp6xdo0TtKWQAsz6pmSAeVqKSbqeyP6hytqr9FDw==}
@ -1170,6 +1182,10 @@ packages:
deep-is@0.1.4:
resolution: {integrity: sha512-oIPzksmTg4/MriiaYGO+okXDT7ztn/w3Eptv/+gSIdMdKsJo0u4CfYNFJPy+4SKMuCqGw2wxnA+URMg3t8a/bQ==}
deepmerge@4.3.1:
resolution: {integrity: sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==}
engines: {node: '>=0.10.0'}
defaults@1.0.4:
resolution: {integrity: sha512-eFuaLoy/Rxalv2kr+lqMlUnrDWV+3j4pljOIJgLIhI058IQfWJ7vXhyEIHu+HtC738klGALYxOKDO0bQP3tg8A==}
@ -1637,6 +1653,13 @@ packages:
html-escaper@3.0.3:
resolution: {integrity: sha512-RuMffC89BOWQoY0WKGpIhn5gX3iI54O6nRA0yC124NYVtzjmFWBIiFd8M0x+ZdX0P9R4lADg1mgP8C7PxGOWuQ==}
html-to-text@9.0.5:
resolution: {integrity: sha512-qY60FjREgVZL03vJU6IfMV4GDjGBIoOyvuFdpBDIX9yTlDw0TjxVBQp+P8NvpdIXNJvfWBTNul7fsAQJq2FNpg==}
engines: {node: '>=14'}
htmlparser2@8.0.2:
resolution: {integrity: sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==}
htmlparser2@9.1.0:
resolution: {integrity: sha512-5zfg6mHUoaer/97TxnGpxmbR7zJtPwIYFMZ/H5ucTlPZhKvtum05yiPK3Mgai3a0DyVxv7qYqoweaEd2nrYQzQ==}
@ -1932,6 +1955,9 @@ packages:
resolution: {integrity: sha512-dcS1ul+9tmeD95T+x28/ehLgd9mENa3LsvDTtzm3vyBEO7RPptvAD+t44WVXaUjTBRcrpFeFlC8WCruUR456hw==}
engines: {node: '>=0.10.0'}
leac@0.6.0:
resolution: {integrity: sha512-y+SqErxb8h7nE/fiEX07jsbuhrpO9lL8eca7/Y1nuWV2moNlXhyd59iDGcRf6moVyDMbmTNzL40SUyrFU/yDpg==}
lerna@8.1.2:
resolution: {integrity: sha512-RCyBAn3XsqqvHbz3TxLfD7ylqzCi1A2UJnFEZmhURgx589vM3qYWQa/uOMeEEf565q6cAdtmulITciX1wgkAtw==}
engines: {node: '>=18.0.0'}
@ -2427,6 +2453,9 @@ packages:
parse5@7.1.2:
resolution: {integrity: sha512-Czj1WaSVpaoj0wbhMzLmWD69anp2WH7FXMB9n1Sy8/ZFF9jolSQVMu1Ij5WIyGmcBmhk7EOndpO4mIpihVqAXw==}
parseley@0.12.1:
resolution: {integrity: sha512-e6qHKe3a9HWr0oMRVDTRhKce+bRO8VGQR3NyVwcjwrbhMmFCX9KszEV35+rn4AdilFAq9VPxP/Fe1wC9Qjd2lw==}
path-exists@3.0.0:
resolution: {integrity: sha512-bpC7GYwiDYQ4wYLe+FA8lhRjhQCMcQGuSgGGqDkg/QerRWw9CmGRT0iSOVRSZJ29NMLZgIzqaljJ63oaL4NIJQ==}
engines: {node: '>=4'}
@ -2461,6 +2490,9 @@ packages:
pause-stream@0.0.11:
resolution: {integrity: sha512-e3FBlXLmN/D1S+zHzanP4E/4Z60oFAa3O051qt1pxa7DEJWKAyil6upYVXCWadEnuoqa4Pkc9oUx9zsxYeRv8A==}
peberminta@0.9.0:
resolution: {integrity: sha512-XIxfHpEuSJbITd1H3EeQwpcZbTLHc+VVr8ANI9t5sit565tsI4/xK3KWTUFE2e6QiangUkh3B0jihzmGnNrRsQ==}
picocolors@1.0.0:
resolution: {integrity: sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==}
@ -2738,6 +2770,9 @@ packages:
secure-json-parse@2.7.0:
resolution: {integrity: sha512-6aU+Rwsezw7VR8/nyvKTx8QpWH9FrcYiXXlqC4z5d5XQBDRqtbfsRjnwGyqbi3gddNtWHuEk9OANUotL26qKUw==}
selderee@0.11.0:
resolution: {integrity: sha512-5TF+l7p4+OsnP8BCCvSyZiSPc4x4//p5uPwK8TCnVPJYRmU2aYKMpOXvw8zM5a5JvuuCGN1jmsMwuU2W02ukfA==}
semver@5.7.2:
resolution: {integrity: sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==}
hasBin: true
@ -3701,6 +3736,11 @@ snapshots:
'@pkgjs/parseargs@0.11.0':
optional: true
'@selderee/plugin-htmlparser2@0.11.0':
dependencies:
domhandler: 5.0.3
selderee: 0.11.0
'@sigstore/bundle@1.1.0':
dependencies:
'@sigstore/protobuf-specs': 0.2.1
@ -3780,6 +3820,8 @@ snapshots:
'@types/ejs@3.1.5': {}
'@types/html-to-text@9.0.4': {}
'@types/jsdom@21.1.6':
dependencies:
'@types/node': 20.12.11
@ -4387,6 +4429,8 @@ snapshots:
deep-is@0.1.4: {}
deepmerge@4.3.1: {}
defaults@1.0.4:
dependencies:
clone: 1.0.4
@ -4902,6 +4946,21 @@ snapshots:
html-escaper@3.0.3: {}
html-to-text@9.0.5:
dependencies:
'@selderee/plugin-htmlparser2': 0.11.0
deepmerge: 4.3.1
dom-serializer: 2.0.0
htmlparser2: 8.0.2
selderee: 0.11.0
htmlparser2@8.0.2:
dependencies:
domelementtype: 2.3.0
domhandler: 5.0.3
domutils: 3.1.0
entities: 4.5.0
htmlparser2@9.1.0:
dependencies:
domelementtype: 2.3.0
@ -5194,6 +5253,8 @@ snapshots:
kind-of@6.0.3: {}
leac@0.6.0: {}
lerna@8.1.2(encoding@0.1.13):
dependencies:
'@lerna/create': 8.1.2(encoding@0.1.13)(typescript@5.4.5)
@ -5925,6 +5986,11 @@ snapshots:
dependencies:
entities: 4.5.0
parseley@0.12.1:
dependencies:
leac: 0.6.0
peberminta: 0.9.0
path-exists@3.0.0: {}
path-exists@4.0.0: {}
@ -5950,6 +6016,8 @@ snapshots:
dependencies:
through: 2.3.8
peberminta@0.9.0: {}
picocolors@1.0.0: {}
picomatch@2.3.1: {}
@ -6225,6 +6293,10 @@ snapshots:
secure-json-parse@2.7.0: {}
selderee@0.11.0:
dependencies:
parseley: 0.12.1
semver@5.7.2: {}
semver@7.6.2: {}