feat: micromatch system to match domains

This commit is contained in:
Artemy Egorov 2023-09-04 21:34:36 +03:00
parent 3129cd97fc
commit 92ba77de77
5 changed files with 452 additions and 30 deletions

28
package-lock.json generated
View File

@ -20,12 +20,14 @@
"fastify": "^4.21.0", "fastify": "^4.21.0",
"ip-range-check": "^0.2.0", "ip-range-check": "^0.2.0",
"jsdom": "^22.1.0", "jsdom": "^22.1.0",
"json-schema-to-ts": "^2.9.2" "json-schema-to-ts": "^2.9.2",
"micromatch": "^4.0.5"
}, },
"devDependencies": { "devDependencies": {
"@types/ejs": "^3.1.2", "@types/ejs": "^3.1.2",
"@types/express": "^4.17.17", "@types/express": "^4.17.17",
"@types/jsdom": "^21.1.1", "@types/jsdom": "^21.1.1",
"@types/micromatch": "^4.0.2",
"@types/node": "^20.4.10", "@types/node": "^20.4.10",
"@typescript-eslint/eslint-plugin": "^6.3.0", "@typescript-eslint/eslint-plugin": "^6.3.0",
"@typescript-eslint/parser": "^6.3.0", "@typescript-eslint/parser": "^6.3.0",
@ -355,6 +357,12 @@
"@types/node": "*" "@types/node": "*"
} }
}, },
"node_modules/@types/braces": {
"version": "3.0.2",
"resolved": "https://registry.npmjs.org/@types/braces/-/braces-3.0.2.tgz",
"integrity": "sha512-U5tlMYa0U/2eFTmJgKcPWQOEICP173sJDa6OjHbj5Tv+NVaYcrq2xmdWpNXOwWYGwJu+jER/pfTLdoQ31q8PzA==",
"dev": true
},
"node_modules/@types/connect": { "node_modules/@types/connect": {
"version": "3.4.35", "version": "3.4.35",
"resolved": "https://registry.npmjs.org/@types/connect/-/connect-3.4.35.tgz", "resolved": "https://registry.npmjs.org/@types/connect/-/connect-3.4.35.tgz",
@ -415,6 +423,15 @@
"version": "7.0.12", "version": "7.0.12",
"license": "MIT" "license": "MIT"
}, },
"node_modules/@types/micromatch": {
"version": "4.0.2",
"resolved": "https://registry.npmjs.org/@types/micromatch/-/micromatch-4.0.2.tgz",
"integrity": "sha512-oqXqVb0ci19GtH0vOA/U2TmHTcRY9kuZl4mqUxe0QmJAlIW13kzhuK5pi1i9+ngav8FjpSb9FVS/GE00GLX1VA==",
"dev": true,
"dependencies": {
"@types/braces": "*"
}
},
"node_modules/@types/mime": { "node_modules/@types/mime": {
"version": "1.3.2", "version": "1.3.2",
"resolved": "https://registry.npmjs.org/@types/mime/-/mime-1.3.2.tgz", "resolved": "https://registry.npmjs.org/@types/mime/-/mime-1.3.2.tgz",
@ -889,7 +906,6 @@
}, },
"node_modules/braces": { "node_modules/braces": {
"version": "3.0.2", "version": "3.0.2",
"dev": true,
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"fill-range": "^7.0.1" "fill-range": "^7.0.1"
@ -1660,7 +1676,6 @@
}, },
"node_modules/fill-range": { "node_modules/fill-range": {
"version": "7.0.1", "version": "7.0.1",
"dev": true,
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"to-regex-range": "^5.0.1" "to-regex-range": "^5.0.1"
@ -2038,7 +2053,6 @@
}, },
"node_modules/is-number": { "node_modules/is-number": {
"version": "7.0.0", "version": "7.0.0",
"dev": true,
"license": "MIT", "license": "MIT",
"engines": { "engines": {
"node": ">=0.12.0" "node": ">=0.12.0"
@ -2243,8 +2257,8 @@
}, },
"node_modules/micromatch": { "node_modules/micromatch": {
"version": "4.0.5", "version": "4.0.5",
"dev": true, "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.5.tgz",
"license": "MIT", "integrity": "sha512-DMy+ERcEW2q8Z2Po+WNXuw3c5YaUSFjAO5GsJqfEl7UjvtIuFKO6ZrKvcItdy98dwFI2N1tg3zNIdKaQT+aNdA==",
"dependencies": { "dependencies": {
"braces": "^3.0.2", "braces": "^3.0.2",
"picomatch": "^2.3.1" "picomatch": "^2.3.1"
@ -2492,7 +2506,6 @@
}, },
"node_modules/picomatch": { "node_modules/picomatch": {
"version": "2.3.1", "version": "2.3.1",
"dev": true,
"license": "MIT", "license": "MIT",
"engines": { "engines": {
"node": ">=8.6" "node": ">=8.6"
@ -3071,7 +3084,6 @@
}, },
"node_modules/to-regex-range": { "node_modules/to-regex-range": {
"version": "5.0.1", "version": "5.0.1",
"dev": true,
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"is-number": "^7.0.0" "is-number": "^7.0.0"

View File

@ -16,12 +16,14 @@
"fastify": "^4.21.0", "fastify": "^4.21.0",
"ip-range-check": "^0.2.0", "ip-range-check": "^0.2.0",
"jsdom": "^22.1.0", "jsdom": "^22.1.0",
"json-schema-to-ts": "^2.9.2" "json-schema-to-ts": "^2.9.2",
"micromatch": "^4.0.5"
}, },
"devDependencies": { "devDependencies": {
"@types/ejs": "^3.1.2", "@types/ejs": "^3.1.2",
"@types/express": "^4.17.17", "@types/express": "^4.17.17",
"@types/jsdom": "^21.1.1", "@types/jsdom": "^21.1.1",
"@types/micromatch": "^4.0.2",
"@types/node": "^20.4.10", "@types/node": "^20.4.10",
"@typescript-eslint/eslint-plugin": "^6.3.0", "@typescript-eslint/eslint-plugin": "^6.3.0",
"@typescript-eslint/parser": "^6.3.0", "@typescript-eslint/parser": "^6.3.0",

View File

@ -3,7 +3,7 @@ import { IHandlerOutput } from "./handler.interface";
import { EngineParseError } from "../errors/main"; import { EngineParseError } from "../errors/main";
export default async function google( export default async function google(
window: DOMWindow window: DOMWindow,
): Promise<IHandlerOutput> { ): Promise<IHandlerOutput> {
const googleAnchors = [ const googleAnchors = [
...window.document.querySelectorAll("a[jsname=ACyKwe]"), ...window.document.querySelectorAll("a[jsname=ACyKwe]"),
@ -22,7 +22,7 @@ export default async function google(
if (!googleAnchors) { if (!googleAnchors) {
throw new EngineParseError( throw new EngineParseError(
"Failed to find anchors in search result [google]" "Failed to find anchors in search result [google]",
); );
} }
@ -41,12 +41,12 @@ export default async function google(
}); });
const search = window.document.getElementById( const search = window.document.getElementById(
"APjFqb" "APjFqb",
) as HTMLTextAreaElement; ) as HTMLTextAreaElement;
const navLinks = [ const navLinks = [
...window.document.querySelectorAll( ...window.document.querySelectorAll(
"table[class=AaVjTc] > tbody > tr > td > a" "table[class=AaVjTc] > tbody > tr > td > a",
), ),
].map((l) => { ].map((l) => {
const link = l as HTMLAnchorElement; const link = l as HTMLAnchorElement;
@ -81,6 +81,383 @@ export default async function google(
}; };
} }
/**
 * Registrable Google search domains, without any subdomain prefix.
 *
 * This is the single source of truth: the `www.` variants exported in
 * `GoogleDomains` are derived from it, so adding or removing a country
 * domain only has to happen in one place.
 */
const GOOGLE_BASE_DOMAINS: readonly string[] = [
  "google.com",
  "google.ad",
  "google.ae",
  "google.com.af",
  "google.com.ag",
  "google.al",
  "google.am",
  "google.co.ao",
  "google.com.ar",
  "google.as",
  "google.at",
  "google.com.au",
  "google.az",
  "google.ba",
  "google.com.bd",
  "google.be",
  "google.bf",
  "google.bg",
  "google.com.bh",
  "google.bi",
  "google.bj",
  "google.com.bn",
  "google.com.bo",
  "google.com.br",
  "google.bs",
  "google.bt",
  "google.co.bw",
  "google.by",
  "google.com.bz",
  "google.ca",
  "google.cd",
  "google.cf",
  "google.cg",
  "google.ch",
  "google.ci",
  "google.co.ck",
  "google.cl",
  "google.cm",
  "google.cn",
  "google.com.co",
  "google.co.cr",
  "google.com.cu",
  "google.cv",
  "google.com.cy",
  "google.cz",
  "google.de",
  "google.dj",
  "google.dk",
  "google.dm",
  "google.com.do",
  "google.dz",
  "google.com.ec",
  "google.ee",
  "google.com.eg",
  "google.es",
  "google.com.et",
  "google.fi",
  "google.com.fj",
  "google.fm",
  "google.fr",
  "google.ga",
  "google.ge",
  "google.gg",
  "google.com.gh",
  "google.com.gi",
  "google.gl",
  "google.gm",
  "google.gr",
  "google.com.gt",
  "google.gy",
  "google.com.hk",
  "google.hn",
  "google.hr",
  "google.ht",
  "google.hu",
  "google.co.id",
  "google.ie",
  "google.co.il",
  "google.im",
  "google.co.in",
  "google.iq",
  "google.is",
  "google.it",
  "google.je",
  "google.com.jm",
  "google.jo",
  "google.co.jp",
  "google.co.ke",
  "google.com.kh",
  "google.ki",
  "google.kg",
  "google.co.kr",
  "google.com.kw",
  "google.kz",
  "google.la",
  "google.com.lb",
  "google.li",
  "google.lk",
  "google.co.ls",
  "google.lt",
  "google.lu",
  "google.lv",
  "google.com.ly",
  "google.co.ma",
  "google.md",
  "google.me",
  "google.mg",
  "google.mk",
  "google.ml",
  "google.com.mm",
  "google.mn",
  "google.com.mt",
  "google.mu",
  "google.mv",
  "google.mw",
  "google.com.mx",
  "google.com.my",
  "google.co.mz",
  "google.com.na",
  "google.com.ng",
  "google.com.ni",
  "google.ne",
  "google.nl",
  "google.no",
  "google.com.np",
  "google.nr",
  "google.nu",
  "google.co.nz",
  "google.com.om",
  "google.com.pa",
  "google.com.pe",
  "google.com.pg",
  "google.com.ph",
  "google.com.pk",
  "google.pl",
  "google.pn",
  "google.com.pr",
  "google.ps",
  "google.pt",
  "google.com.py",
  "google.com.qa",
  "google.ro",
  "google.ru",
  "google.rw",
  "google.com.sa",
  "google.com.sb",
  "google.sc",
  "google.se",
  "google.com.sg",
  "google.sh",
  "google.si",
  "google.sk",
  "google.com.sl",
  "google.sn",
  "google.so",
  "google.sm",
  "google.sr",
  "google.st",
  "google.com.sv",
  "google.td",
  "google.tg",
  "google.co.th",
  "google.com.tj",
  "google.tl",
  "google.tm",
  "google.tn",
  "google.to",
  "google.com.tr",
  "google.tt",
  "google.com.tw",
  "google.co.tz",
  "google.com.ua",
  "google.co.ug",
  "google.co.uk",
  "google.com.uy",
  "google.co.uz",
  "google.com.vc",
  "google.co.ve",
  "google.co.vi",
  "google.com.vn",
  "google.vu",
  "google.ws",
  "google.rs",
  "google.co.za",
  "google.co.zm",
  "google.co.zw",
  "google.cat",
];

/**
 * Every hostname that should be handled by the Google engine: each base
 * domain as-is, followed by its `www.` counterpart.
 *
 * The ordering (all bare domains first, then all `www.` domains) and the
 * mutable `string[]` type match the previously hand-written list, so
 * existing consumers of this export keep working unchanged.
 */
export const GoogleDomains: string[] = [
  ...GOOGLE_BASE_DOMAINS,
  ...GOOGLE_BASE_DOMAINS.map((domain) => `www.${domain}`),
];
interface GoogleProps { interface GoogleProps {
href: string; href: string;
siteName: string; siteName: string;

View File

@ -5,22 +5,21 @@ import { JSDOM } from "jsdom";
import { DOMWindow } from "jsdom"; import { DOMWindow } from "jsdom";
import readability from "./readability"; import readability from "./readability";
import google from "./google"; import google, { GoogleDomains } from "./google";
import stackoverflow from "./stackoverflow/main"; import stackoverflow, { StackOverflowDomains } from "./stackoverflow/main";
import { generateProxyUrl } from "../utils/generate"; import { generateProxyUrl } from "../utils/generate";
import isLocalResource from "../utils/islocal"; import isLocalResource from "../utils/islocal";
import { import micromatch from "micromatch";
LocalResourceError,
NotHtmlMimetypeError, import { LocalResourceError, NotHtmlMimetypeError } from "../errors/main";
} from "../errors/main";
export default async function handlePage( export default async function handlePage(
url: string, // remote URL url: string, // remote URL
requestUrl: URL, // proxy URL requestUrl: URL, // proxy URL
engine?: string, engine?: string,
redirect_path: string = "get" redirect_path: string = "get",
): Promise<IHandlerOutput> { ): Promise<IHandlerOutput> {
const urlObj = new URL(url); const urlObj = new URL(url);
@ -39,7 +38,12 @@ export default async function handlePage(
[...window.document.getElementsByTagName("a")].forEach((link) => { [...window.document.getElementsByTagName("a")].forEach((link) => {
try { try {
link.href = generateProxyUrl(requestUrl, link.href, engine, redirect_path); link.href = generateProxyUrl(
requestUrl,
link.href,
engine,
redirect_path,
);
} catch (_err) { } catch (_err) {
// ignore TypeError: Invalid URL // ignore TypeError: Invalid URL
} }
@ -49,25 +53,41 @@ export default async function handlePage(
return engines[engine](window); return engines[engine](window);
} }
return fallback[urlObj.host]?.(window) || fallback["*"](window); for (let match of fallback) {
if (micromatch.isMatch(urlObj.hostname, match.pattern)) {
return match.engine(window);
}
}
return engines.readability(window);
} }
interface Engines { interface Engines {
[key: string]: EngineFunction; [key: string]: EngineFunction;
} }
type EngineFunction = (window: DOMWindow) => Promise<IHandlerOutput>;
export const engines: Engines = { export const engines: Engines = {
readability, readability,
google, google,
stackoverflow, stackoverflow,
}; };
type EngineFunction = (window: DOMWindow) => Promise<IHandlerOutput>;
export type EngineMatch = {
pattern: string | string[];
engine: EngineFunction;
};
export type EnginesMatch = EngineMatch[];
export const engineList: string[] = Object.keys(engines); export const engineList: string[] = Object.keys(engines);
const fallback: Engines = { export const fallback: EnginesMatch = [
"stackoverflow.com": engines.stackoverflow, {
"www.google.com": engines.google, pattern: GoogleDomains,
"*": engines.readability, engine: engines.google,
}; },
{
pattern: StackOverflowDomains,
engine: engines.stackoverflow,
},
];

View File

@ -4,7 +4,7 @@ import { EngineParseError } from "../../errors/main";
import qPostsHandler from "./questions-posts"; import qPostsHandler from "./questions-posts";
export default async function stackoverflow( export default async function stackoverflow(
window: DOMWindow window: DOMWindow,
): Promise<IHandlerOutput> { ): Promise<IHandlerOutput> {
const url = new URL(window.location.href); const url = new URL(window.location.href);
@ -29,3 +29,14 @@ export default async function stackoverflow(
return result; return result;
} }
/**
 * Hostname patterns for the sites served by the stackoverflow engine.
 *
 * The entries are glob patterns intended for micromatch: a leading `*.`
 * (e.g. `*.stackexchange.com`) covers subdomains of that site, while an
 * entry without a wildcard only matches that literal hostname.
 */
export const StackOverflowDomains = [
"stackoverflow.com",
"*.stackoverflow.com",
"*.stackexchange.com",
"askubuntu.com",
"stackapps.com",
"mathoverflow.net",
"superuser.com",
"serverfault.com",
];