feat: add typesense/docsearch-scraper (#11424)

This commit is contained in:
Giorgio Boa 2025-06-05 21:06:07 +02:00 committed by GitHub
parent 24c3e38c51
commit 65d5a00ef1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 73 additions and 0 deletions

24
.github/workflows/docsearch.yml vendored Normal file
View File

@ -0,0 +1,24 @@
# Re-indexes the documentation into Typesense on every push to master by
# running the typesense/docsearch-scraper container.
name: Index docs to Typesense
on:
  push:
    branches:
      - master
# The job only checks out the repository, so a read-only token is sufficient.
permissions:
  contents: read
jobs:
  index_docs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Index docs to Typesense
        # Connection settings are exposed as step environment variables and
        # forwarded to the container by name (`-e NAME`), so expression
        # values are never spliced into the shell script text.
        env:
          # NOTE(review): an API key is normally stored as a secret rather
          # than a repository variable - confirm `vars` is intended here.
          TYPESENSE_API_KEY: ${{ vars.TYPESENSE_API_KEY }}
          TYPESENSE_HOST: ${{ vars.TYPESENSE_HOST }}
          TYPESENSE_PORT: "443"
          TYPESENSE_PROTOCOL: https
        run: |
          # CONFIG carries the scraper config file serialized by jq into a
          # single JSON string.
          docker run \
            -e TYPESENSE_API_KEY \
            -e TYPESENSE_HOST \
            -e TYPESENSE_PORT \
            -e TYPESENSE_PROTOCOL \
            -e CONFIG="$(jq -r tostring docs/docsearch-scraper-config.json)" \
            typesense/docsearch-scraper

View File

@ -0,0 +1,49 @@
{
  "index_name": "typeorm-docs",
  "start_urls": [
    "https://typeorm.io/"
  ],
  "sitemap_urls": [
    "https://typeorm.io/sitemap.xml"
  ],
  "allowed_domains": [
    "typeorm.io"
  ],
  "sitemap_alternate_links": true,
  "stop_urls": [],
  "selectors": {
    "lvl0": {
      "selector": "(//ul[contains(@class,'menu__list')]//a[contains(@class, 'menu__link menu__link--sublist menu__link--active')]/text() | //nav[contains(@class, 'navbar')]//a[contains(@class, 'navbar__link--active')]/text())[last()]",
      "type": "xpath",
      "global": true,
      "default_value": "Documentation"
    },
    "lvl1": "article h1, header h1",
    "lvl2": "article h2",
    "lvl3": "article h3",
    "lvl4": "article h4",
    "lvl5": "article h5, article td:first-child",
    "lvl6": "article h6",
    "text": "article p, article li, article td:last-child"
  },
  "strip_chars": " .,;:#",
  "custom_settings": {
    "separatorsToIndex": "_",
    "attributesForFaceting": [
      "language",
      "version",
      "type",
      "docusaurus_tag"
    ],
    "attributesToRetrieve": [
      "hierarchy",
      "content",
      "anchor",
      "url",
      "url_without_anchor",
      "type"
    ]
  },
  "conversation_id": [
    "833762294"
  ],
  "nb_hits": 0
}