Merge pull request #10 from TxtDot/google-parsing

Google parsing
This commit is contained in:
Andrey 2023-08-15 14:42:47 +04:00 committed by GitHub
commit 17e5773988
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 77 additions and 15 deletions

45
src/handlers/google.ts Normal file
View File

@ -0,0 +1,45 @@
import { DOMWindow } from "jsdom";
import { IHandlerOutput } from "./handler.interface";
/**
 * Engine that turns a Google search results page into a compact list of
 * result links, preceded by a small search form.
 *
 * Each organic result is located through a fixed CSS path below `#rso`; the
 * matched element is the result's anchor, whose heading supplies the title.
 * Anchors for which no heading element can be found are skipped rather than
 * rendered as "undefined" or crashing the handler.
 *
 * When nothing matches (no hits, or a page whose markup does not fit the
 * selector) the output still contains the search form, just with no results.
 *
 * @param window - jsdom window of the fetched results page.
 * @returns `content`: search form followed by one `<p><a>` entry per result;
 *          `textContent`: one `title > url` line per result;
 *          `title` / `lang`: taken from the document itself.
 */
export default async function google(
  window: DOMWindow
): Promise<IHandlerOutput> {
  // querySelectorAll always returns a (possibly empty) NodeList, never
  // null/undefined, so there is nothing to null-check here.
  const googleAnchors = window.document.querySelectorAll(
    "#rso > div > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > a:nth-child(1)"
  );

  // Escapes a value for use inside a double-quoted HTML attribute.
  const escapeAttribute = (value: string): string =>
    value
      .replace(/&/g, "&amp;")
      .replace(/"/g, "&quot;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;");

  // The title is expected to be the anchor's second child node. If that node
  // is absent or is not an element (e.g. a text node), fall back to the first
  // heading found anywhere inside the anchor.
  const findHeading = (anchor: Element): Element | null => {
    const second = anchor.childNodes[1];
    if (second && second.nodeType === second.ELEMENT_NODE) {
      return second as Element;
    }
    return anchor.querySelector("h1, h2, h3, h4, h5, h6");
  };

  const content: string[] = [];
  const textContent: string[] = [];

  for (const result of Array.from(googleAnchors)) {
    const anchor = result as HTMLAnchorElement;
    const heading = findHeading(anchor);
    if (!heading) {
      continue;
    }

    // NOTE(review): heading.innerHTML is markup from the remote page and is
    // inserted as-is; presumably it is sanitized further down the pipeline -
    // confirm.
    content.push(
      `<p><a href="${escapeAttribute(anchor.href)}">${heading.innerHTML}</a></p>`
    );
    // The plain-text variant must not carry tags or HTML entities.
    textContent.push(`${heading.textContent ?? ""} > ${anchor.href}`);
  }

  // Inline navigation script shared by the form's submit handler and the
  // button's click handler. The query is percent-encoded for Google, and the
  // resulting Google URL is percent-encoded again because it travels as the
  // value of this site's own `url` query parameter.
  const navigate =
    "window.location.href = '/?url=' + encodeURIComponent('https://www.google.com/search?q=' + encodeURIComponent(document.getElementById('q').value));";

  const searchForm = `
<form onsubmit="${navigate} return false">
<input type="text" name="q" id="q">
<input type="button" value="Search" onclick="${navigate}">
</form>
`;

  return {
    content: `${searchForm}${content.join("")}`,
    textContent: textContent.join("\n"),
    title: window.document.title,
    lang: window.document.documentElement.lang,
  };
}

View File

@ -1,10 +1,11 @@
import axios from "../types/axios";
import { IHandlerOutput } from "./handler.interface";
import { readability } from "./readability";
import axios from "../types/axios";
import { JSDOM } from "jsdom";
type EngineFunction = (url: Document) => Promise<IHandlerOutput>;
import readability from "./readability";
import google from "./google";
import { DOMWindow } from "jsdom";
export default async function handlePage(
url: string,
originalUrl: string,
@ -15,33 +16,39 @@ export default async function handlePage(
}
const response = await axios.get(url);
const document = new JSDOM(response.data, { url: url }).window.document;
const window = new JSDOM(response.data, { url: url }).window;
const UrlParsed = new URL(originalUrl);
[...document.getElementsByTagName("a")].forEach((link) => {
[...window.document.getElementsByTagName("a")].forEach((link) => {
link.href = `${UrlParsed.origin}/?url=${link.href}${
engine && `&engine=${engine}`
engine ? `&engine=${engine}` : ""
}`;
});
if (engine) {
return engines[engine](document);
return engines[engine](window);
}
const host = new URL(url).hostname;
return fallback[host](document) || fallback["*"](document);
return fallback[host]?.(window) || fallback["*"](window);
}
/** String-keyed lookup table of page handlers. */
interface Engines {
[key: string]: EngineFunction;
}
/**
 * Signature shared by every engine: receives the jsdom window of the fetched
 * page and resolves to the handler output.
 */
type EngineFunction = (window: DOMWindow) => Promise<IHandlerOutput>;
/** Engines that can be selected by name, keyed by that name. */
export const engines: Engines = {
readability,
google,
};
/** Names of all registered engines, derived from the keys of `engines`. */
export const engineList: string[] = Object.keys(engines);
/**
 * Engine used when none is requested explicitly, keyed by hostname.
 * The "*" entry is the catch-all for hostnames without their own entry.
 */
const fallback: Engines = {
"www.google.com": engines.google,
"*": engines.readability,
};

View File

@ -1,8 +1,11 @@
import { Readability } from "@mozilla/readability";
import { IHandlerOutput } from "./handler.interface";
import { DOMWindow } from "jsdom";
export async function readability(document: Document): Promise<IHandlerOutput> {
const reader = new Readability(document);
export default async function readability(
window: DOMWindow
): Promise<IHandlerOutput> {
const reader = new Readability(window.document);
const parsed = reader.parse();
if (!parsed) {

View File

@ -7,7 +7,7 @@ import { generateOriginUrl } from "../utils";
export default async function mainRoute(fastify: FastifyInstance) {
fastify.get("/", async (request: GetRequest, reply) => {
const remoteUrl = request.query.url;
const engine = request.query.engine || "readability";
const engine = request.query.engine;
let format: string;

View File

@ -1,7 +1,8 @@
import { FastifyInstance } from "fastify";
import { engineList } from "../handlers/main";
export default async function parseRoute(fastify: FastifyInstance) {
fastify.get("/start", async (_, reply) => {
return reply.view("/templates/start.ejs");
return reply.view("/templates/start.ejs", { engineList });
});
}

View File

@ -2,6 +2,7 @@ import axios from "axios";
export default axios.create({
headers: {
"User-Agent": "txtdot",
"User-Agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0",
},
});

View File

@ -17,7 +17,12 @@
<label for="engine">Engine</label>
<select name="engine">
<option selected value>Standard</option>
<option value="readability">Readability</option>
<% engineList.forEach((engine)=>{
%>
<option value="<%= engine %>">
<%= engine %>
</option>
<% }) %>
</select>
</p>
<p>