From eb7d0dc1f85baab193366d475cf699d999a5a9c1 Mon Sep 17 00:00:00 2001 From: Chip Wasson Date: Mon, 9 Oct 2023 16:08:39 -0600 Subject: [PATCH] Add Open Graph scraper module --- package.json | 1 + src/app.module.ts | 2 + src/ogscraper/ogscraper.controller.ts | 25 +++++ src/ogscraper/ogscraper.module.ts | 10 ++ src/ogscraper/ogscraper.service.ts | 20 ++++ yarn.lock | 145 ++++++++++++++++++++++++++ 6 files changed, 203 insertions(+) create mode 100644 src/ogscraper/ogscraper.controller.ts create mode 100644 src/ogscraper/ogscraper.module.ts create mode 100644 src/ogscraper/ogscraper.service.ts diff --git a/package.json b/package.json index d801147..46b2419 100644 --- a/package.json +++ b/package.json @@ -32,6 +32,7 @@ "cache-manager": "^5.2.3", "hbs": "^4.2.0", "minio": "^7.1.3", + "open-graph-scraper": "^6.3.0", "ramda": "^0.29.0", "reflect-metadata": "^0.1.13", "rxjs": "^7.8.1", diff --git a/src/app.module.ts b/src/app.module.ts index cf2c896..1690271 100644 --- a/src/app.module.ts +++ b/src/app.module.ts @@ -12,6 +12,7 @@ import { CacheModule } from '@nestjs/cache-manager'; import { IrcbotModule } from './ircbot/ircbot.module'; import { IrcbotService } from './ircbot/ircbot.service'; import { DinosaurwetModule } from './dinosaurwet/dinosaurwet.module'; +import { OgScraperModule } from './ogscraper/ogscraper.module'; @Module({ imports: [ @@ -27,6 +28,7 @@ import { DinosaurwetModule } from './dinosaurwet/dinosaurwet.module'; DomainrproxyModule, IrcbotModule, DinosaurwetModule, + OgScraperModule, ], controllers: [AppController], providers: [AppService], diff --git a/src/ogscraper/ogscraper.controller.ts b/src/ogscraper/ogscraper.controller.ts new file mode 100644 index 0000000..0e26786 --- /dev/null +++ b/src/ogscraper/ogscraper.controller.ts @@ -0,0 +1,25 @@ +import { Body, Controller, Post } from '@nestjs/common'; +import { ApiProperty, ApiTags } from '@nestjs/swagger'; +const ogs = require('open-graph-scraper'); +import { SuccessResult } from 'open-graph-scraper'; +import { OgScraperService } from './ogscraper.service'; + +class ScrapeOgDto { + @ApiProperty({ + description: 'URL of the page to fetch Open Graph metadata of', + example: + 'https://qz.com/1903322/why-pivot-tables-are-the-spreadsheets-most-powerful-tool', + }) + url: string; +} + +@Controller('ogscraper') +@ApiTags('open-graph-scraper') +export class OgScraperController { + constructor(private readonly ogScraperService: OgScraperService) {} + + @Post('') + async scrapeOg(@Body() body: ScrapeOgDto): Promise { + return this.ogScraperService.getOg(body.url); + } +} diff --git a/src/ogscraper/ogscraper.module.ts b/src/ogscraper/ogscraper.module.ts new file mode 100644 index 0000000..2dbb21f --- /dev/null +++ b/src/ogscraper/ogscraper.module.ts @@ -0,0 +1,10 @@ +import { Module } from '@nestjs/common'; +import { OgScraperController } from './ogscraper.controller'; +import { OgScraperService } from './ogscraper.service'; + +@Module({ + controllers: [OgScraperController], + providers: [OgScraperService], + exports: [OgScraperService], +}) +export class OgScraperModule {} diff --git a/src/ogscraper/ogscraper.service.ts b/src/ogscraper/ogscraper.service.ts new file mode 100644 index 0000000..29f8de1 --- /dev/null +++ b/src/ogscraper/ogscraper.service.ts @@ -0,0 +1,20 @@ +import { Injectable } from '@nestjs/common'; +import { ConfigService } from '@nestjs/config'; +const ogs = require('open-graph-scraper'); +import { SuccessResult } from 'open-graph-scraper'; + +@Injectable() +export class OgScraperService { + constructor(public readonly configService: ConfigService) {} + + async getOg(url: string): Promise { + return ogs({ + url, + fetchOptions: { + headers: { + 'user-agent': this.configService.get('userAgent') || '', + }, + }, + }); + } +} diff --git a/yarn.lock b/yarn.lock index 8d3da6a..d9a10a8 100644 --- a/yarn.lock +++ b/yarn.lock @@ -402,6 +402,11 @@ resolved "https://registry.yarnpkg.com/@eslint/js/-/js-8.48.0.tgz#642633964e217905436033a2bd08bf322849b7fb" integrity sha512-ZSjtmelB7IJfWD2Fvb7+Z+ChTIKWq6kjda95fLcQKNS5aheVHn4IkfgRQE3sIIzTcSLwLcLZUD9UBt+V7+h+Pw== +"@fastify/busboy@^2.0.0": + version "2.0.0" + resolved "https://registry.yarnpkg.com/@fastify/busboy/-/busboy-2.0.0.tgz#f22824caff3ae506b18207bad4126dbc6ccdb6b8" + integrity sha512-JUFJad5lv7jxj926GPgymrWQxxjPYuJNiNjNMzqT+HiuP6Vl3dk5xzG+8sTX96np0ZAluvaMzPsjhHZ5rNuNQQ== + "@humanwhocodes/config-array@^0.11.10": version "0.11.11" resolved "https://registry.yarnpkg.com/@humanwhocodes/config-array/-/config-array-0.11.11.tgz#88a04c570dbbc7dd943e4712429c3df09bc32844" @@ -1715,6 +1720,11 @@ body-parser@1.20.2: type-is "~1.6.18" unpipe "1.0.0" +boolbase@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/boolbase/-/boolbase-1.0.0.tgz#68dff5fbe60c51eb37725ea9e3ed310dcc1e776e" + integrity sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww== + bplist-parser@^0.2.0: version "0.2.0" resolved "https://registry.yarnpkg.com/bplist-parser/-/bplist-parser-0.2.0.tgz#43a9d183e5bf9d545200ceac3e712f79ebbe8d0e" @@ -1878,6 +1888,36 @@ chardet@^0.7.0: resolved "https://registry.yarnpkg.com/chardet/-/chardet-0.7.0.tgz#90094849f0937f2eedc2425d0d28a9e5f0cbad9e" integrity sha512-mT8iDcrh03qDGRRmoA2hmBJnxpllMR+0/0qlzjqZES6NdiWDcZkCNAk4rPFZ9Q85r27unkiNNg8ZOiwZXBHwcA== +chardet@^2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/chardet/-/chardet-2.0.0.tgz#24a74bd3954965ba2d30f490a915fc3cf053051f" + integrity sha512-xVgPpulCooDjY6zH4m9YW3jbkaBe3FKIAvF5sj5t7aBNsVl2ljIE+xwJ4iNgiDZHFQvNIpjdKdVOQvvk5ZfxbQ== + +cheerio-select@^2.1.0: + version "2.1.0" + resolved "https://registry.yarnpkg.com/cheerio-select/-/cheerio-select-2.1.0.tgz#4d8673286b8126ca2a8e42740d5e3c4884ae21b4" + integrity sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g== + dependencies: + boolbase "^1.0.0" + css-select "^5.1.0" + css-what "^6.1.0" + domelementtype "^2.3.0" + domhandler "^5.0.3" + domutils "^3.0.1" + +cheerio@^1.0.0-rc.12: + version "1.0.0-rc.12" + resolved "https://registry.yarnpkg.com/cheerio/-/cheerio-1.0.0-rc.12.tgz#788bf7466506b1c6bf5fae51d24a2c4d62e47683" + integrity sha512-VqR8m68vM46BNnuZ5NtnGBKIE/DfN0cRIzg9n40EIq9NOv90ayxLBXA8fXC5gquFRGJSTRqBq25Jt2ECLR431Q== + dependencies: + cheerio-select "^2.1.0" + dom-serializer "^2.0.0" + domhandler "^5.0.3" + domutils "^3.0.1" + htmlparser2 "^8.0.1" + parse5 "^7.0.0" + parse5-htmlparser2-tree-adapter "^7.0.0" + chokidar@3.5.3, chokidar@^3.5.3: version "3.5.3" resolved "https://registry.yarnpkg.com/chokidar/-/chokidar-3.5.3.tgz#1cf37c8707b932bd1af1ae22c0432e2acd1903bd" @@ -2125,6 +2165,22 @@ cross-spawn@^7.0.0, cross-spawn@^7.0.2, cross-spawn@^7.0.3: shebang-command "^2.0.0" which "^2.0.1" +css-select@^5.1.0: + version "5.1.0" + resolved "https://registry.yarnpkg.com/css-select/-/css-select-5.1.0.tgz#b8ebd6554c3637ccc76688804ad3f6a6fdaea8a6" + integrity sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg== + dependencies: + boolbase "^1.0.0" + css-what "^6.1.0" + domhandler "^5.0.2" + domutils "^3.0.1" + nth-check "^2.0.1" + +css-what@^6.1.0: + version "6.1.0" + resolved "https://registry.yarnpkg.com/css-what/-/css-what-6.1.0.tgz#fb5effcf76f1ddea2c81bdfaa4de44e79bac70f4" + integrity sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw== + debug@2.6.9: version "2.6.9" resolved "https://registry.yarnpkg.com/debug/-/debug-2.6.9.tgz#5d128515df134ff327e90a4c93f4e077a536341f" @@ -2251,6 +2307,36 @@ doctrine@^3.0.0: dependencies: esutils "^2.0.2" +dom-serializer@^2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/dom-serializer/-/dom-serializer-2.0.0.tgz#e41b802e1eedf9f6cae183ce5e622d789d7d8e53" + integrity sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg== + dependencies: + domelementtype "^2.3.0" + domhandler "^5.0.2" + entities "^4.2.0" + +domelementtype@^2.3.0: + version "2.3.0" + resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-2.3.0.tgz#5c45e8e869952626331d7aab326d01daf65d589d" + integrity sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw== + +domhandler@^5.0.2, domhandler@^5.0.3: + version "5.0.3" + resolved "https://registry.yarnpkg.com/domhandler/-/domhandler-5.0.3.tgz#cc385f7f751f1d1fc650c21374804254538c7d31" + integrity sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w== + dependencies: + domelementtype "^2.3.0" + +domutils@^3.0.1: + version "3.1.0" + resolved "https://registry.yarnpkg.com/domutils/-/domutils-3.1.0.tgz#c47f551278d3dc4b0b1ab8cbb42d751a6f0d824e" + integrity sha512-H78uMmQtI2AhgDJjWeQmHwJJ2bLPD3GMmO7Zja/ZZh84wkm+4ut+IUnUdRa8uCGX88DiVx1j6FRe1XfxEgjEZA== + dependencies: + dom-serializer "^2.0.0" + domelementtype "^2.3.0" + domhandler "^5.0.3" + dotenv-expand@10.0.0: version "10.0.0" resolved "https://registry.yarnpkg.com/dotenv-expand/-/dotenv-expand-10.0.0.tgz#12605d00fb0af6d0a592e6558585784032e4ef37" @@ -2308,6 +2394,11 @@ enhanced-resolve@^5.0.0, enhanced-resolve@^5.15.0, enhanced-resolve@^5.7.0: graceful-fs "^4.2.4" tapable "^2.2.0" +entities@^4.2.0, entities@^4.4.0: + version "4.5.0" + resolved "https://registry.yarnpkg.com/entities/-/entities-4.5.0.tgz#5d268ea5e7113ec74c4d033b79ea5a35a488fb48" + integrity sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw== + error-ex@^1.3.1: version "1.3.2" resolved "https://registry.yarnpkg.com/error-ex/-/error-ex-1.3.2.tgz#b4ac40648107fdcdcfae242f428bea8a14d4f1bf" @@ -3016,6 +3107,16 @@ html-escaper@^2.0.0: resolved "https://registry.yarnpkg.com/html-escaper/-/html-escaper-2.0.2.tgz#dfd60027da36a36dfcbe236262c00a5822681453" integrity sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg== +htmlparser2@^8.0.1: + version "8.0.2" + resolved "https://registry.yarnpkg.com/htmlparser2/-/htmlparser2-8.0.2.tgz#f002151705b383e62433b5cf466f5b716edaec21" + integrity sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA== + dependencies: + domelementtype "^2.3.0" + domhandler "^5.0.3" + domutils "^3.0.1" + entities "^4.4.0" + http-errors@2.0.0: version "2.0.0" resolved "https://registry.yarnpkg.com/http-errors/-/http-errors-2.0.0.tgz#b7774a1486ef73cf7667ac9ae0858c012c57b9d3" @@ -4239,6 +4340,13 @@ npmlog@^5.0.1: gauge "^3.0.0" set-blocking "^2.0.0" +nth-check@^2.0.1: + version "2.1.1" + resolved "https://registry.yarnpkg.com/nth-check/-/nth-check-2.1.1.tgz#c9eab428effce36cd6b92c924bdb000ef1f1ed1d" + integrity sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w== + dependencies: + boolbase "^1.0.0" + object-assign@^4, object-assign@^4.1.1: version "4.1.1" resolved "https://registry.yarnpkg.com/object-assign/-/object-assign-4.1.1.tgz#2109adc7965887cfc05cbbd442cac8bfbb360863" @@ -4277,6 +4385,16 @@ onetime@^6.0.0: dependencies: mimic-fn "^4.0.0" +open-graph-scraper@^6.3.0: + version "6.3.0" + resolved "https://registry.yarnpkg.com/open-graph-scraper/-/open-graph-scraper-6.3.0.tgz#aecac3dea4e206a43e1d2659bbb114cbeb25f45e" + integrity sha512-Kl8Xkigvw20ZppXt1m2RCucGo/L3ZsgtThZyE5CS4GJr5MrxDz7wQN7iaO/wkM3JuiP5XzblZ3gIpLIsC9dhQw== + dependencies: + chardet "^2.0.0" + cheerio "^1.0.0-rc.12" + undici "^5.25.4" + validator "^13.11.0" + open@^9.1.0: version "9.1.0" resolved "https://registry.yarnpkg.com/open/-/open-9.1.0.tgz#684934359c90ad25742f5a26151970ff8c6c80b6" @@ -4377,6 +4495,21 @@ parse-json@^5.0.0, parse-json@^5.2.0: json-parse-even-better-errors "^2.3.0" lines-and-columns "^1.1.6" +parse5-htmlparser2-tree-adapter@^7.0.0: + version "7.0.0" + resolved "https://registry.yarnpkg.com/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.0.0.tgz#23c2cc233bcf09bb7beba8b8a69d46b08c62c2f1" + integrity sha512-B77tOZrqqfUfnVcOrUvfdLbz4pu4RopLD/4vmu3HUPswwTA8OH0EMW9BlWR2B0RCoiZRAHEUu7IxeP1Pd1UU+g== + dependencies: + domhandler "^5.0.2" + parse5 "^7.0.0" + +parse5@^7.0.0: + version "7.1.2" + resolved "https://registry.yarnpkg.com/parse5/-/parse5-7.1.2.tgz#0736bebbfd77793823240a23b7fc5e010b7f8e32" + integrity sha512-Czj1WaSVpaoj0wbhMzLmWD69anp2WH7FXMB9n1Sy8/ZFF9jolSQVMu1Ij5WIyGmcBmhk7EOndpO4mIpihVqAXw== + dependencies: + entities "^4.4.0" + parseurl@~1.3.3: version "1.3.3" resolved "https://registry.yarnpkg.com/parseurl/-/parseurl-1.3.3.tgz#9da19e7bee8d12dff0513ed5b76957793bc2e8d4" @@ -5328,6 +5461,13 @@ uid@2.0.2: dependencies: "@lukeed/csprng" "^1.0.0" +undici@^5.25.4: + version "5.25.4" + resolved "https://registry.yarnpkg.com/undici/-/undici-5.25.4.tgz#7d8ef81d94f84cd384986271e5e5599b6dff4296" + integrity sha512-450yJxT29qKMf3aoudzFpIciqpx6Pji3hEWaXqXmanbXF58LTAGCKxcJjxMXWu3iG+Mudgo3ZUfDB6YDFd/dAw== + dependencies: + "@fastify/busboy" "^2.0.0" + universalify@^2.0.0: version "2.0.0" resolved "https://registry.yarnpkg.com/universalify/-/universalify-2.0.0.tgz#75a4984efedc4b08975c5aeb73f530d02df25717" @@ -5398,6 +5538,11 @@ v8-to-istanbul@^9.0.1: "@types/istanbul-lib-coverage" "^2.0.1" convert-source-map "^1.6.0" +validator@^13.11.0: + version "13.11.0" + resolved "https://registry.yarnpkg.com/validator/-/validator-13.11.0.tgz#23ab3fd59290c61248364eabf4067f04955fbb1b" + integrity sha512-Ii+sehpSfZy+At5nPdnyMhx78fEoPDkR2XW/zimHEL3MyGJQOCQ7WeP20jPYRz7ZCpcKLB21NxuXHF3bxjStBQ== + vary@^1, vary@~1.1.2: version "1.1.2" resolved "https://registry.yarnpkg.com/vary/-/vary-1.1.2.tgz#2299f02c6ded30d4a5961b0b9f74524a18f634fc"