From 6e47a6fba7197cd6b9da93c1bea81f9d182be264 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pascal=20H=C3=B6hnel?= Date: Fri, 17 Jan 2025 08:24:03 +0100 Subject: [PATCH 1/3] add error message, if returned HTML does not contain required elements --- utils/scraper.ts | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/utils/scraper.ts b/utils/scraper.ts index 2527a10..a78401e 100644 --- a/utils/scraper.ts +++ b/utils/scraper.ts @@ -129,9 +129,11 @@ export const scrapeKeywordFromGoogle = async (keyword:KeywordType, settings:Sett refreshedResults.error = `[${error.response.status}] ${error.response.statusText}`; } - console.log('[ERROR] Scraping Keyword : ', keyword.keyword, '. Error: ', error && error.response && error.response.statusText); + console.log('[ERROR] Scraping Keyword : ', keyword.keyword); if (!(error && error.response && error.response.statusText)) { console.log('[ERROR_MESSAGE]: ', error); + } else { + console.log('[ERROR_MESSAGE]: ', error && error.response && error.response.statusText); } } @@ -148,9 +150,17 @@ export const extractScrapedResult = (content: string, device: string): SearchRes const extractedResult = []; const $ = cheerio.load(content); + const hasValidContent = [...$('body').find('#search'), ...$('body').find('#rso')]; + if (hasValidContent.length == 0) { + const msg = '[ERROR] Scraped search results do not adhere to expected format. Unable to parse results'; + console.log(msg); + throw new Error(msg); + } + const hasNumberofResult = $('body').find('#search > div > div'); const searchResultItems = hasNumberofResult.find('h3'); let lastPosition = 0; + console.log('Scraped search results contain ', searchResultItems.length, ' desktop results.'); for (let i = 0; i < searchResultItems.length; i += 1) { if (searchResultItems[i]) { @@ -161,11 +171,12 @@ export const extractScrapedResult = (content: string, device: string): SearchRes extractedResult.push({ title, url, position: lastPosition }); } } - } + } - // Mobile Scraper - if (extractedResult.length === 0 && device === 'mobile') { + // Mobile Scraper + if (extractedResult.length === 0 && device === 'mobile') { const items = $('body').find('#rso > div'); + console.log('Scraped search results contain ', items.length, ' mobile results.'); for (let i = 0; i < items.length; i += 1) { const item = $(items[i]); const linkDom = item.find('a[role="presentation"]'); @@ -181,7 +192,7 @@ export const extractScrapedResult = (content: string, device: string): SearchRes } } - return extractedResult; + return extractedResult; }; /** From cab8f518bbc03c0486072485c60053e06b401eac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pascal=20H=C3=B6hnel?= Date: Fri, 17 Jan 2025 09:10:43 +0100 Subject: [PATCH 2/3] also add result-check to proxy --- scrapers/services/proxy.ts | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scrapers/services/proxy.ts b/scrapers/services/proxy.ts index c37cd1e..88f3e5a 100644 --- a/scrapers/services/proxy.ts +++ b/scrapers/services/proxy.ts @@ -16,6 +16,13 @@ const proxy:ScraperSettings = { const $ = cheerio.load(content); let lastPosition = 0; + const hasValidContent = $('body').find('#main') + if (hasValidContent.length == 0) { + const msg = '[ERROR] Scraped search results from proxy do not adhere to expected format. Unable to parse results'; + console.log(msg); + throw new Error(msg); + } + const mainContent = $('body').find('#main'); const children = $(mainContent).find('h3'); From c34c8260c7268c1e4221ea8d7cff74b0693bcfd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pascal=20H=C3=B6hnel?= Date: Fri, 17 Jan 2025 09:36:41 +0100 Subject: [PATCH 3/3] improve error message in UI on proxy use --- utils/scraper.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/scraper.ts b/utils/scraper.ts index a78401e..83efb84 100644 --- a/utils/scraper.ts +++ b/utils/scraper.ts @@ -127,6 +127,8 @@ export const scrapeKeywordFromGoogle = async (keyword:KeywordType, settings:Sett refreshedResults.error = scraperError || 'Unknown Error'; if (settings.scraper_type === 'proxy' && error && error.response && error.response.statusText) { refreshedResults.error = `[${error.response.status}] ${error.response.statusText}`; + } else if (settings.scraper_type === 'proxy' && error) { + refreshedResults.error = error; } console.log('[ERROR] Scraping Keyword : ', keyword.keyword);