diff --git a/packages/plugins/src/engines/readability.ts b/packages/plugins/src/engines/readability.ts index daf3f1c..f35d8f3 100644 --- a/packages/plugins/src/engines/readability.ts +++ b/packages/plugins/src/engines/readability.ts @@ -19,7 +19,6 @@ Readability.route('*path', async (input, ro) => { return { content: parsed.content, - textContent: parsed.textContent, title: parsed.title, lang: parsed.lang, }; diff --git a/packages/server/package.json b/packages/server/package.json index 077012f..f3c2f99 100644 --- a/packages/server/package.json +++ b/packages/server/package.json @@ -30,6 +30,7 @@ "dotenv": "^16.3.1", "ejs": "^3.1.10", "fastify": "^4.26.2", + "html-to-text": "^9.0.5", "iconv-lite": "^0.6.3", "ip-range-check": "^0.2.0", "json-schema-to-ts": "^3.0.1", @@ -40,6 +41,7 @@ "devDependencies": { "@types/dompurify": "^3.0.5", "@types/ejs": "^3.1.5", + "@types/html-to-text": "^9.0.4", "@types/jsdom": "^21.1.6", "@types/micromatch": "^4.0.7", "clean-css-cli": "^5.6.3", diff --git a/packages/server/src/config/pluginConfig.ts b/packages/server/src/config/pluginConfig.ts index 0d51d9f..349e6af 100644 --- a/packages/server/src/config/pluginConfig.ts +++ b/packages/server/src/config/pluginConfig.ts @@ -1,5 +1,6 @@ import { IAppConfig } from '../types/appConfig'; import { engineList } from '@txtdot/plugins'; +import { compile } from 'html-to-text'; /** * Configuration of plugins @@ -7,6 +8,7 @@ import { engineList } from '@txtdot/plugins'; */ const plugin_config: IAppConfig = { engines: [...engineList], + html2text: compile(), }; export default plugin_config; diff --git a/packages/server/src/distributor.ts b/packages/server/src/distributor.ts index 1709743..e92d11f 100644 --- a/packages/server/src/distributor.ts +++ b/packages/server/src/distributor.ts @@ -10,6 +10,7 @@ import { Engine } from '@txtdot/sdk'; import { HandlerInput, HandlerOutput } from '@txtdot/sdk'; import config from './config'; import { parseHTML } from 'linkedom'; +import { html2text } from './utils/html2text'; interface IEngineId { [key: string]: number; @@ -78,7 +79,7 @@ export class Distributor { return { content, textContent: - output.textContent || dom.document.documentElement.textContent || '', + html2text(output, dom.document) || 'Text output cannot be generated.', title: output.title || dom.document.title, lang: output.lang || dom.document.documentElement.lang, }; diff --git a/packages/server/src/types/appConfig.ts b/packages/server/src/types/appConfig.ts index 49bccd6..efad8af 100644 --- a/packages/server/src/types/appConfig.ts +++ b/packages/server/src/types/appConfig.ts @@ -1,5 +1,8 @@ import { Engine } from '@txtdot/sdk'; +type Html2TextConverter = (html: string) => string; + export interface IAppConfig { engines: Engine[]; + html2text?: Html2TextConverter; } diff --git a/packages/server/src/utils/html2text.ts b/packages/server/src/utils/html2text.ts new file mode 100644 index 0000000..4f2471b --- /dev/null +++ b/packages/server/src/utils/html2text.ts @@ -0,0 +1,9 @@ +import { EngineOutput } from '@txtdot/sdk/dist/types/handler'; +import config from '../config'; + +export function html2text(output: EngineOutput, doc: Document) { + if (output.textContent) return output.textContent; + else if (config.plugin.html2text) + return config.plugin.html2text(output.content); + else return doc.documentElement.textContent; +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index f9a3b0d..444be98 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -103,6 +103,9 @@ importers: fastify: specifier: ^4.26.2 version: 4.27.0 + html-to-text: + specifier: ^9.0.5 + version: 9.0.5 iconv-lite: specifier: ^0.6.3 version: 0.6.3 @@ -128,6 +131,9 @@ importers: '@types/ejs': specifier: ^3.1.5 version: 3.1.5 + '@types/html-to-text': + specifier: ^9.0.4 + version: 9.0.4 '@types/jsdom': specifier: ^21.1.6 version: 21.1.6 @@ -543,6 +549,9 @@ packages: resolution: {integrity: sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==} engines: {node: '>=14'} + '@selderee/plugin-htmlparser2@0.11.0': + resolution: {integrity: sha512-P33hHGdldxGabLFjPPpaTxVolMrzrcegejx+0GxjrIb9Zv48D8yAIA/QTDR2dFl7Uz7urX8aX6+5bCZslr+gWQ==} + '@sigstore/bundle@1.1.0': resolution: {integrity: sha512-PFutXEy0SmQxYI4texPw3dd2KewuNqv7OuK1ZFtY2fM754yhvG2KdgwIhRnoEE2uHdtdGNQ8s0lb94dW9sELog==} engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0} @@ -615,6 +624,9 @@ packages: '@types/ejs@3.1.5': resolution: {integrity: sha512-nv+GSx77ZtXiJzwKdsASqi+YQ5Z7vwHsTP0JY2SiQgjGckkBRKZnk8nIM+7oUZ1VCtuTz0+By4qVR7fqzp/Dfg==} + '@types/html-to-text@9.0.4': + resolution: {integrity: sha512-pUY3cKH/Nm2yYrEmDlPR1mR7yszjGx4DrwPjQ702C4/D5CwHuZTgZdIdwPkRbcuhs7BAh2L5rg3CL5cbRiGTCQ==} + '@types/jsdom@21.1.6': resolution: {integrity: sha512-/7kkMsC+/kMs7gAYmmBR9P0vGTnOoLhQhyhQJSlXGI5bzTHp6xdo0TtKWQAsz6pmSAeVqKSbqeyP6hytqr9FDw==} @@ -1170,6 +1182,10 @@ packages: deep-is@0.1.4: resolution: {integrity: sha512-oIPzksmTg4/MriiaYGO+okXDT7ztn/w3Eptv/+gSIdMdKsJo0u4CfYNFJPy+4SKMuCqGw2wxnA+URMg3t8a/bQ==} + deepmerge@4.3.1: + resolution: {integrity: sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==} + engines: {node: '>=0.10.0'} + defaults@1.0.4: resolution: {integrity: sha512-eFuaLoy/Rxalv2kr+lqMlUnrDWV+3j4pljOIJgLIhI058IQfWJ7vXhyEIHu+HtC738klGALYxOKDO0bQP3tg8A==} @@ -1637,6 +1653,13 @@ packages: html-escaper@3.0.3: resolution: {integrity: sha512-RuMffC89BOWQoY0WKGpIhn5gX3iI54O6nRA0yC124NYVtzjmFWBIiFd8M0x+ZdX0P9R4lADg1mgP8C7PxGOWuQ==} + html-to-text@9.0.5: + resolution: {integrity: sha512-qY60FjREgVZL03vJU6IfMV4GDjGBIoOyvuFdpBDIX9yTlDw0TjxVBQp+P8NvpdIXNJvfWBTNul7fsAQJq2FNpg==} + engines: {node: '>=14'} + + htmlparser2@8.0.2: + resolution: {integrity: sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==} + htmlparser2@9.1.0: resolution: {integrity: sha512-5zfg6mHUoaer/97TxnGpxmbR7zJtPwIYFMZ/H5ucTlPZhKvtum05yiPK3Mgai3a0DyVxv7qYqoweaEd2nrYQzQ==} @@ -1932,6 +1955,9 @@ packages: resolution: {integrity: sha512-dcS1ul+9tmeD95T+x28/ehLgd9mENa3LsvDTtzm3vyBEO7RPptvAD+t44WVXaUjTBRcrpFeFlC8WCruUR456hw==} engines: {node: '>=0.10.0'} + leac@0.6.0: + resolution: {integrity: sha512-y+SqErxb8h7nE/fiEX07jsbuhrpO9lL8eca7/Y1nuWV2moNlXhyd59iDGcRf6moVyDMbmTNzL40SUyrFU/yDpg==} + lerna@8.1.2: resolution: {integrity: sha512-RCyBAn3XsqqvHbz3TxLfD7ylqzCi1A2UJnFEZmhURgx589vM3qYWQa/uOMeEEf565q6cAdtmulITciX1wgkAtw==} engines: {node: '>=18.0.0'} @@ -2427,6 +2453,9 @@ packages: parse5@7.1.2: resolution: {integrity: sha512-Czj1WaSVpaoj0wbhMzLmWD69anp2WH7FXMB9n1Sy8/ZFF9jolSQVMu1Ij5WIyGmcBmhk7EOndpO4mIpihVqAXw==} + parseley@0.12.1: + resolution: {integrity: sha512-e6qHKe3a9HWr0oMRVDTRhKce+bRO8VGQR3NyVwcjwrbhMmFCX9KszEV35+rn4AdilFAq9VPxP/Fe1wC9Qjd2lw==} + path-exists@3.0.0: resolution: {integrity: sha512-bpC7GYwiDYQ4wYLe+FA8lhRjhQCMcQGuSgGGqDkg/QerRWw9CmGRT0iSOVRSZJ29NMLZgIzqaljJ63oaL4NIJQ==} engines: {node: '>=4'} @@ -2461,6 +2490,9 @@ packages: pause-stream@0.0.11: resolution: {integrity: sha512-e3FBlXLmN/D1S+zHzanP4E/4Z60oFAa3O051qt1pxa7DEJWKAyil6upYVXCWadEnuoqa4Pkc9oUx9zsxYeRv8A==} + peberminta@0.9.0: + resolution: {integrity: sha512-XIxfHpEuSJbITd1H3EeQwpcZbTLHc+VVr8ANI9t5sit565tsI4/xK3KWTUFE2e6QiangUkh3B0jihzmGnNrRsQ==} + picocolors@1.0.0: resolution: {integrity: sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==} @@ -2738,6 +2770,9 @@ packages: secure-json-parse@2.7.0: resolution: {integrity: sha512-6aU+Rwsezw7VR8/nyvKTx8QpWH9FrcYiXXlqC4z5d5XQBDRqtbfsRjnwGyqbi3gddNtWHuEk9OANUotL26qKUw==} + selderee@0.11.0: + resolution: {integrity: sha512-5TF+l7p4+OsnP8BCCvSyZiSPc4x4//p5uPwK8TCnVPJYRmU2aYKMpOXvw8zM5a5JvuuCGN1jmsMwuU2W02ukfA==} + semver@5.7.2: resolution: {integrity: sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==} hasBin: true @@ -3701,6 +3736,11 @@ snapshots: '@pkgjs/parseargs@0.11.0': optional: true + '@selderee/plugin-htmlparser2@0.11.0': + dependencies: + domhandler: 5.0.3 + selderee: 0.11.0 + '@sigstore/bundle@1.1.0': dependencies: '@sigstore/protobuf-specs': 0.2.1 @@ -3780,6 +3820,8 @@ snapshots: '@types/ejs@3.1.5': {} + '@types/html-to-text@9.0.4': {} + '@types/jsdom@21.1.6': dependencies: '@types/node': 20.12.11 @@ -4387,6 +4429,8 @@ snapshots: deep-is@0.1.4: {} + deepmerge@4.3.1: {} + defaults@1.0.4: dependencies: clone: 1.0.4 @@ -4902,6 +4946,21 @@ snapshots: html-escaper@3.0.3: {} + html-to-text@9.0.5: + dependencies: + '@selderee/plugin-htmlparser2': 0.11.0 + deepmerge: 4.3.1 + dom-serializer: 2.0.0 + htmlparser2: 8.0.2 + selderee: 0.11.0 + + htmlparser2@8.0.2: + dependencies: + domelementtype: 2.3.0 + domhandler: 5.0.3 + domutils: 3.1.0 + entities: 4.5.0 + htmlparser2@9.1.0: dependencies: domelementtype: 2.3.0 @@ -5194,6 +5253,8 @@ snapshots: kind-of@6.0.3: {} + leac@0.6.0: {} + lerna@8.1.2(encoding@0.1.13): dependencies: '@lerna/create': 8.1.2(encoding@0.1.13)(typescript@5.4.5) @@ -5925,6 +5986,11 @@ snapshots: dependencies: entities: 4.5.0 + parseley@0.12.1: + dependencies: + leac: 0.6.0 + peberminta: 0.9.0 + path-exists@3.0.0: {} path-exists@4.0.0: {} @@ -5950,6 +6016,8 @@ snapshots: dependencies: through: 2.3.8 + peberminta@0.9.0: {} + picocolors@1.0.0: {} picomatch@2.3.1: {} @@ -6225,6 +6293,10 @@ snapshots: secure-json-parse@2.7.0: {} + selderee@0.11.0: + dependencies: + parseley: 0.12.1 + semver@5.7.2: {} semver@7.6.2: {}