gitlab-foss/create_index.py at 11f0927940e0d42ac733b7e952edc7c338a08e9b

mirror of https://gitlab.com/gitlab-org/gitlab-foss.git synced 2025-07-25 16:03:48 +00:00

Files

GitLab Bot 539748dfe7 Add latest changes from gitlab-org/gitlab@master

2025-03-14 09:12:34 +00:00

204 lines

7.3 KiB

Python

Raw Blame History

 # Usage
 # 1. Install requirements:
 # pip install requests langchain langchain_text_splitters
 # 2. Run the script (-d downloads the docs for --version_tag, -u uploads the index and requires GLAB_TOKEN):
 # GLAB_TOKEN=<api_token> python3 scripts/custom_models/create_index.py -d -u --version_tag="v17.0.0"
 import argparse
 import glob
 import os
 import datetime
 import re
 import sqlite3
 import sys
 import requests
 import json
 from zipfile import ZipFile
 from langchain.docstore.document import Document
 from langchain_text_splitters import MarkdownHeaderTextSplitter
 import tempfile
 import logging
 # Configure the root logger so records at INFO level and above are emitted.
 logging.basicConfig(level=logging.INFO)
 # Module-level logger shared by the functions in this script.
 logger = logging.getLogger(__name__)
 # Function to parse command-line arguments
 def parse_arguments():
     """Parse the command-line options of the docs index generator.

     Returns:
         argparse.Namespace with the attributes ``output_path``, ``download``,
         ``version_tag``, ``upload``, ``base_url`` and ``project_id``.
     """
     parser = argparse.ArgumentParser(description="Generate GitLab docs index.")
     parser.add_argument("-o", "--output_path",
                         help="Output path",
                         default="docs.db")
     # Without -d the markdown files are looked up under "doc/" relative to the
     # current working directory.
     parser.add_argument("-d", "--download",
                         help="Downloads GitLab docs from a reference. If disabled, assumes docs are in ./doc",
                         action='store_true')
     # The tag selects the archive to download (-d) and also names the package
     # version the index is uploaded under (-u).
     parser.add_argument("--version_tag",
                         help="GitLab version tag to include in the URL (e.g., v17.1.0-ee). "
                              "Used when -d or -u is set",
                         default="master")
     parser.add_argument("-u", "--upload",
                         help="Uploads documentation as a generic package to a registry defined by "
                              "project_id and base_url. "
                              "Requires GLAB_TOKEN to be defined with a GitLab PAT with api scope",
                         action='store_true')
     parser.add_argument("--base_url",
                         help="URL of the GitLab instance to upload to. Only used when -u is set",
                         default="https://gitlab.com")
     # No type= conversion: the value is only interpolated into the upload URL,
     # so it stays a string when given on the command line.
     parser.add_argument("--project_id",
                         help="GitLab project ID. Only used when -u is set.",
                         default=278964)
     return parser.parse_args()
 def execution_error(error_message):
     """Log ``error_message`` at ERROR level and terminate with exit status 1.

     This function never returns: it always raises ``SystemExit``.
     """
     logger.error(error_message)
     raise SystemExit(1)
 # Function to fetch documents from GitLab
 def fetch_documents(version_tag):
     """Download and unpack the ``doc`` directory of gitlab-org/gitlab.

     The archive for ``version_tag`` is fetched from gitlab.com, written to a
     fresh temporary directory and extracted there. The temporary directory
     is not removed by this function, because the returned path lives in it.

     Args:
         version_tag: Git reference (tag or branch) used to build the archive URL.

     Returns:
         Path of the first directory found in the temporary directory after
         extraction.

     Exits the process with status 1 (via ``execution_error``) when the
     request fails, the status code is not 200, or the archive contains no
     directory.
     """
     docs_url = f"https://gitlab.com/gitlab-org/gitlab/-/archive/{version_tag}/gitlab-{version_tag}.zip?path=doc"
     try:
         # (connect, read) timeout so a stalled connection cannot hang the script forever.
         response = requests.get(docs_url, timeout=(10, 300))
     except requests.RequestException as error:
         execution_error(f"Failed to download documents: {error}")
     if response.status_code != 200:
         execution_error(f"Failed to download documents. Status code: {response.status_code}")

     tmpdirname = tempfile.mkdtemp()
     zip_path = os.path.join(tmpdirname, "docs.zip")
     with open(zip_path, 'wb') as f:
         f.write(response.content)
     with ZipFile(zip_path, 'r') as zip_ref:
         zip_ref.extractall(tmpdirname)

     # Find the directory that was extracted
     candidates = (os.path.join(tmpdirname, name) for name in os.listdir(tmpdirname))
     extracted_dirs = [candidate for candidate in candidates if os.path.isdir(candidate)]
     if not extracted_dirs:
         execution_error("No directory found after extraction. Exiting.")
     logger.info("Documents are fetched.")
     extracted_dir = extracted_dirs[0]
     logger.info(f"Extracted documents to {extracted_dir}")
     return extracted_dir
 def upload_url(base_url, project_id, version_tag):
     """Build the generic-package URL the index file ``docs.db`` is uploaded to.

     Args:
         base_url: Root URL of the GitLab instance; trailing slashes are ignored.
         project_id: ID of the project that owns the package registry.
         version_tag: Used as the package version segment of the URL.

     Returns:
         The full API URL as a string.
     """
     # Strip trailing slashes so "https://host/" does not produce "https://host//api/...".
     root = base_url.rstrip("/")
     return f"{root}/api/v4/projects/{project_id}/packages/generic/gitlab-duo-local-documentation-index/{version_tag}/docs.db"
 def build_row_corpus(row):
     """Build the normalized search text for one document chunk.

     Args:
         row: dict with a ``'content'`` string and a ``'metadata'`` dict. The
             metadata may hold the chunk's titles under ``Header1``..``Header5``.

     Returns:
         The lowercased, punctuation-free text made of the titles followed by
         the chunk body, or ``''`` when the body is empty once a leading
         ``---`` preamble has been removed.
     """
     corpus = row['content']
     # Remove the preamble. Only a '---' block at the very start counts:
     # a '---' further down (table separator, horizontal rule) is regular content.
     leading = corpus.lstrip()
     if leading.startswith('---'):
         preamble_end = leading.find('---', 3)
         # Without a closing delimiter there is no preamble to strip.
         if preamble_end != -1:
             # Skip all three dashes of the closing delimiter and keep the
             # rest of the chunk up to and including its last character.
             corpus = leading[preamble_end + 3:]
     if not corpus.strip():
         return ''
     # Attach the titles to the corpus, these can still be useful.
     # Join with spaces so words of adjacent titles are not fused together.
     titles = ' '.join(
         title
         for title in (row['metadata'].get(f"Header{i}", '') for i in range(1, 6))
         if title
     )
     corpus = titles + ' ' + corpus
     # Stemming could be helpful, but it is already applied by the sqlite
     # Remove punctuation and set to lowercase, this should reduce the size of the corpus and allow
     # the query to be a bit more robust
     corpus = corpus.lower()
     corpus = re.sub(r'[^\w\s]', '', corpus)
     return corpus
 # Function to process documents and create the database
 def create_database(path, output_path):
     """Index every markdown file below ``<path>/doc`` into a SQLite FTS5 database.

     Each file is split into chunks at its markdown headers (levels 1 to 5).
     For every chunk the normalized text from ``build_row_corpus``, the raw
     chunk text and the JSON-encoded metadata (header titles plus the source
     ``filename``) are stored in the ``doc_index`` table. Chunks whose
     normalized text is empty are skipped.

     Args:
         path: Directory that contains the ``doc`` folder; ``''`` means the
             current working directory.
         output_path: Database file to create. An existing file is deleted first.

     Exits the process with status 1 (via ``execution_error``) when no
     markdown file is found.
     """
     files = glob.glob(os.path.join(path, "doc/**/*.md"), recursive=True)
     if not files:
         execution_error("No markdown files found")

     # Split content into chunks by its header
     headers_to_split_on = [
         ("#", "Header1"),
         ("##", "Header2"),
         ("###", "Header3"),
         ("####", "Header4"),
         ("#####", "Header5"),
     ]
     markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

     sql_tuples = []
     for filename in files:
         # Decode explicitly instead of relying on the platform's default
         # encoding (assumes the markdown sources are UTF-8 - TODO confirm).
         with open(filename, "r", encoding="utf-8") as f:
             page_content = f.read()
         for chunk in markdown_splitter.split_text(page_content):
             metadata = {**chunk.metadata, "filename": filename}
             processed = build_row_corpus({"content": chunk.page_content, "metadata": metadata})
             if processed:
                 sql_tuples.append((processed, chunk.page_content, json.dumps(metadata)))

     if os.path.exists(output_path):
         os.remove(output_path)
         logger.info(f"Deleted existing file at {output_path}")

     # Create the database
     conn = sqlite3.connect(output_path)
     try:
         # "with conn" commits on success and rolls back on error; it does not
         # close the connection, hence the surrounding try/finally.
         with conn:
             conn.execute("CREATE VIRTUAL TABLE doc_index USING fts5(processed, content, metadata, tokenize='porter trigram');")
             conn.execute("PRAGMA user_version = 1;")
             conn.executemany('INSERT INTO doc_index (processed, content, metadata) VALUES (?,?,?)', sql_tuples)
     finally:
         conn.close()
 # Function to upload the database file to GitLab package registry
 def upload_to_gitlab(upload_url, file_path, private_token):
     """Upload the database file to the package registry with an HTTP PUT.

     Args:
         upload_url: Target URL of the package file.
         file_path: Local path of the file to send.
         private_token: Token sent as ``Authorization: Bearer <token>``.

     A 200 or 201 response is logged as success; any other status is logged
     as an error together with the response body. Nothing is returned.
     """
     headers = {"Authorization": f"Bearer {private_token}"}
     with open(file_path, 'rb') as f:
         # NOTE(review): files= makes requests send a multipart/form-data body.
         # Confirm the registry stores only the file content and not the
         # multipart envelope; if not, the raw body (data=f) is needed instead.
         files = {"file": f}
         # (connect, read) timeout so a stalled connection cannot hang the script forever.
         response = requests.put(upload_url, headers=headers, files=files, timeout=(10, 300))
     if response.status_code in {200, 201}:
         logger.info("Database uploaded successfully.")
     else:
         logger.error(f"Upload failed with status code: {response.status_code}, response: {response.content}")
 if __name__ == "__main__":
     # Script entry point: optionally download the docs, build the index and
     # optionally upload it to the package registry.
     args = parse_arguments()
     if args.upload:
         # Use .get() so an unset GLAB_TOKEN reaches the error message below
         # instead of aborting with a KeyError traceback.
         private_token = os.environ.get('GLAB_TOKEN')
         if not private_token:
             execution_error("Private token must be set.")
     if args.download:
         docs_path = fetch_documents(version_tag=args.version_tag)
         if not docs_path:
             execution_error("Fetching documents failed")
     else:
         # An empty path makes create_database look for "doc/" in the current
         # working directory.
         docs_path = ''
     output_path = args.output_path
     create_database(docs_path, output_path)
     logger.info(f"Database created at {output_path}")
     if args.upload:
         # Upload to GitLab
         if not os.path.exists(output_path):
             execution_error("Database file not found.")
         url = upload_url(args.base_url, args.project_id, args.version_tag)
         logger.info(f"Uploading to {url}")
         upload_to_gitlab(url, output_path, private_token)

204 lines 7.3 KiB Python Raw Blame History

204 lines

7.3 KiB

Python

Raw Blame History