
Modules

load_likes(data_directory)

Load 'like' data from JavaScript files in a given directory.

This function searches for files matching the pattern 'like*.js' within the specified data directory, extracts JSON data from each file, and aggregates the 'like' data into a list.

Parameters:

    data_directory (str): The path to the directory containing the 'like*.js' files. Required.

Returns:

    list[dict[str, LikeInfo]]: A list of dictionaries containing 'like' information.

Notes:

    If an error occurs while processing a file, the exception is caught, an error message is printed, and the function continues with the remaining files.

Source code in search_x_likes/list_likes_in_archive.py
def load_likes(data_directory: str) -> list[dict[str, LikeInfo]]:
    """
    Load 'like' data from JavaScript files in a given directory.

    This function searches for files matching the pattern 'like*.js' within the specified
    data directory, extracts JSON data from each file, and aggregates the 'like' data into a list.

    Args:
        data_directory (str): The path to the directory containing the 'like*.js' files.

    Returns:
        list[dict[str, LikeInfo]]: A list of dictionaries containing 'like' information.

    Note:
        If an error occurs while processing a file, the exception is caught,
        an error message is printed, and the function continues with the
        remaining files.
    """
    likes: list[dict[str, LikeInfo]] = []
    data_path: Path = Path(data_directory)
    like_files = data_path.glob("like*.js")
    for file_path in like_files:
        with file_path.open("r", encoding="utf-8") as f:
            content: str = f.read()
            try:
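                # X archive files are JS assignments ("window.YTD.like.partN = [ ... ]");
                # everything after the first '=' is the JSON payload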
                json_data: str = content[content.index("=") + 1 :].strip()
                like_part: list[dict[str, LikeInfo]] = json.loads(json_data)
                likes.extend(like_part)
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")
    return likes
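
A minimal usage sketch. The import path is taken from the source reference above; the directory path is illustrative, and the 'like'/'fullText' keys are assumed from the search code shown further down this page.

from search_x_likes.list_likes_in_archive import load_likes

likes = load_likes("path/to/x-archive/data")  # the unpacked archive's data/ folder
print(f"Loaded {len(likes)} liked posts")
if likes:
    # Each entry wraps its payload under a 'like' key (see the search apps below)
    print(likes[0]["like"]["fullText"])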

get_embedding(client, text, model='text-embedding-ada-002')

Generate an embedding vector for a given text using the specified OpenAI model.

Parameters:

    client (openai.OpenAI): An OpenAI client object. Required.
    text (str): A string containing the input text for which to generate embeddings. Required.
    model (str): The name of the embedding model to use. Defaults to 'text-embedding-ada-002'.

Returns:

    list[float]: A list of floats representing the embedding vector.

Source code in search_x_likes/embed_posts.py
def get_embedding(client: openai.OpenAI, text: str, model: str = "text-embedding-ada-002") -> list[float]:
    """
    Generate an embedding vector for a given text using the specified OpenAI model.

    Args:
        client (openai.OpenAI): An OpenAI client object.
        text (str): A string containing the input text for which to generate embeddings.
        model (str, optional): The name of the embedding model to use. Defaults to "text-embedding-ada-002".

    Returns:
        list[float]: A list of floats representing the embedding vector.
    """
    cleaned_text = text.replace("\n", " ")
    try:
        response = client.embeddings.create(input=[cleaned_text], model=model)
        # Extract the embedding vector from the response
        embedding: list[float] = response.data[0].embedding
    except Exception as e:
        print(f"An error occurred while generating the embedding: {e}")
        raise
    return embedding
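
A minimal usage sketch, assuming the function is importable from search_x_likes.embed_posts and that OPENAI_API_KEY is set in the environment:

import openai

from search_x_likes.embed_posts import get_embedding

client = openai.OpenAI()  # picks up OPENAI_API_KEY from the environment
vector = get_embedding(client, "A liked post about static typing in Python")
print(len(vector))  # text-embedding-ada-002 produces 1536-dimensional vectors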

InputApp

Bases: App

Source code in search_x_likes/exact_search.py
class InputApp(App):
    CSS = """
    Input {
        margin: 1 1;
    }
    Label {
        margin: 1 2;
    }
    TextArea {
        margin: 1 2;
    }
    """

    def compose(self) -> ComposeResult:
        """Set up the layout."""
        # Create the Input and TextArea widgets within a Vertical container
        yield Label(f"Search in {len(likes)} posts you liked on X.")
        yield Input(
            placeholder="Enter search term...",
        )
        # yield TextArea(id="results")  # Simplified TextArea
        yield tw.Markdown(markdown="Search results will be displayed here...")

    # Explicitly handle the changed event for the input widget
    @on(Input.Changed)
    def on_input_changed(self, event: Input.Changed) -> None:
        """Handle input change events."""
        query: str = event.value.strip()
        results_widget: tw.Markdown = self.query_one(tw.Markdown)
        search: list[str] = []
        number_of_matches: int = 0
        for like_obj in likes:
            like: LikeInfo = like_obj.get("like", {})
            full_text: str = like.get("fullText", "")
            highlight_text: str = highlight_query(full_text, query)
            expanded_url: str = like.get("expandedUrl", "N/A")
            if query in highlight_text:
                search.append(f"❱ [{expanded_url}](expanded_url): " + highlight_text)
                number_of_matches += 1
                if number_of_matches > MAX_NUMBER_OF_MATCHES_SHOWN:
                    break

        results_widget.update("\n\n".join(search))

compose()

Set up the layout.

Source code in search_x_likes/exact_search.py
def compose(self) -> ComposeResult:
    """Set up the layout."""
    # Create the Input and TextArea widgets within a Vertical container
    yield Label(f"Search in {len(likes)} posts you liked on X.")
    yield Input(
        placeholder="Enter search term...",
    )
    # yield TextArea(id="results")  # Simplified TextArea
    yield tw.Markdown(markdown="Search results will be displayed here...")

on_input_changed(event)

Handle input change events.

Source code in search_x_likes/exact_search.py
@on(Input.Changed)
def on_input_changed(self, event: Input.Changed) -> None:
    """Handle input change events."""
    query: str = event.value.strip()
    results_widget: tw.Markdown = self.query_one(tw.Markdown)
    search: list[str] = []
    number_of_matches: int = 0
    for like_obj in likes:
        like: LikeInfo = like_obj.get("like", {})
        full_text: str = like.get("fullText", "")
        highlight_text: str = highlight_query(full_text, query)
        expanded_url: str = like.get("expandedUrl", "N/A")
        if query in highlight_text:
            search.append(f"❱ [{expanded_url}](expanded_url): " + highlight_text)
            number_of_matches += 1
            if number_of_matches > MAX_NUMBER_OF_MATCHES_SHOWN:
                break

    results_widget.update("\n\n".join(search))
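
To try the search TUI, a hypothetical launcher looks like the sketch below; it assumes the module loads the likes archive into the module-level likes list at import time and exposes InputApp as documented above.

from search_x_likes.exact_search import InputApp

if __name__ == "__main__":
    InputApp().run()  # Textual takes over the terminal until the app exits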

highlight_query(text, query)

Wraps every occurrence of the query string in bold Markdown in the text.

Source code in search_x_likes/exact_search.py
def highlight_query(text: str, query: str) -> str:
    """Wraps every occurrence of the query string in bold Markdown in the text."""
    # Escape special regex characters in the query to avoid issues
    query_escaped = re.escape(query)

    # Use re.sub to replace all occurrences of the query with bold Markdown
    highlighted_text = re.sub(f"({query_escaped})", r"**\1**", text, flags=re.IGNORECASE)

    return highlighted_text
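
For example, matching is case-insensitive while the original casing of the text is preserved:

print(highlight_query("Rust and rustaceans", "rust"))
# -> **Rust** and **rust**aceans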

InputApp

Bases: App

Source code in search_x_likes/bm25_search.py
class InputApp(App):
    CSS = """
    Input {
        margin: 1 1;
    }
    Label {
        margin: 1 2;
    }
    TextArea {
        margin: 1 2;
    }
    """

    def compose(self) -> ComposeResult:
        """Set up the layout."""
        # Create the Input and TextArea widgets within a Vertical container
        yield Label(f"Search in {len(likes)} posts you liked on X.")
        yield Input(
            placeholder="Enter search term...",
        )
        # yield TextArea(id="results")  # Simplified TextArea
        yield tw.Markdown(markdown="Search results will be displayed here...")

    # Explicitly handle the changed event for the input widget
    @on(Input.Changed)
    def on_input_changed(self, event: Input.Changed) -> None:
        """Handle input change events."""
        query: str = event.value
        if len(query) < 4:
            return
        query = query.strip()
        query_tokens = bm25s.tokenize(query, stemmer=stemmer)
        results_widget: tw.Markdown = self.query_one(tw.Markdown)
        if len(query_tokens) < 1:  # Do not retrieve when there are no tokens (e.g. word is a stopword)
            results_widget.update("")
            return

        # Get top-k results as a tuple of (documents, scores); passing corpus= returns the documents themselves. Both arrays have shape (n_queries, k)
        results, scores = retriever.retrieve(query_tokens, corpus=corpus, k=MAX_NUMBER_OF_MATCHES_SHOWN)

        # Retrieve the found documents and update the markdown
        docs = [f"❱ {results[0, i]}" for i in range(results.shape[1])]

        results_widget.update("\n\n".join(docs))

compose()

Set up the layout.

Source code in search_x_likes/bm25_search.py
def compose(self) -> ComposeResult:
    """Set up the layout."""
    # Create the Input and TextArea widgets within a Vertical container
    yield Label(f"Search in {len(likes)} posts you liked on X.")
    yield Input(
        placeholder="Enter search term...",
    )
    # yield TextArea(id="results")  # Simplified TextArea
    yield tw.Markdown(markdown="Search results will be displayed here...")

on_input_changed(event)

Handle input change events.

Source code in search_x_likes/bm25_search.py
@on(Input.Changed)
def on_input_changed(self, event: Input.Changed) -> None:
    """Handle input change events."""
    query: str = event.value
    if len(query) < 4:
        return
    query = query.strip()
    query_tokens = bm25s.tokenize(query, stemmer=stemmer)
    results_widget: tw.Markdown = self.query_one(tw.Markdown)
    if len(query_tokens) < 1:  # Do not retrieve when there are no tokens (e.g. word is a stopword)
        results_widget.update("")
        return

    # Get top-k results as a tuple of (documents, scores); passing corpus= returns the documents themselves. Both arrays have shape (n_queries, k)
    results, scores = retriever.retrieve(query_tokens, corpus=corpus, k=MAX_NUMBER_OF_MATCHES_SHOWN)

    # Retrieve the found documents and update the markdown
    docs = [f"❱ {results[0, i]}" for i in range(results.shape[1])]

    results_widget.update("\n\n".join(docs))
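
The module-level stemmer, corpus, and retriever used above are defined elsewhere in bm25_search.py and are not shown on this page. A plausible setup following the standard bm25s indexing pattern (the corpus construction and variable names here are assumptions) is:

import bm25s
import Stemmer  # PyStemmer

from search_x_likes.list_likes_in_archive import load_likes

likes = load_likes("path/to/x-archive/data")
# Index the plain text of every liked post
corpus = [like_obj.get("like", {}).get("fullText", "") for like_obj in likes]

stemmer = Stemmer.Stemmer("english")
corpus_tokens = bm25s.tokenize(corpus, stopwords="en", stemmer=stemmer)

retriever = bm25s.BM25()
retriever.index(corpus_tokens)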

InputApp

Bases: App

Source code in search_x_likes/cosine_search.py
class InputApp(App):
    CSS = """
    Input {
        margin: 1 1;
    }
    Label {
        margin: 1 2;
    }
    TextArea {
        margin: 1 2;
    }
    """

    def compose(self) -> ComposeResult:
        """Set up the layout."""
        # Create the Input and TextArea widgets within a Vertical container
        yield Label(f"Search in {df.shape[0]} posts you liked on X.")
        yield Input(
            placeholder="Enter search term...",
        )
        # yield TextArea(id="results")  # Simplified TextArea
        yield tw.Markdown(markdown="Search results will be displayed here...")

    # Explicitly handle the changed event for the input widget
    # @on(Input.Changed)
    # def on_input_changed(self, event: Input.Changed) -> None:
    @on(Input.Submitted)
    def on_input_submitted(self, event: Input.Submitted) -> None:
        """Handle input submission events (when Enter is pressed)."""
        query: str = event.value
        if len(query) < 4:
            return
        query = query.strip()
        response = client.embeddings.create(input=[query], model=EMBEDDING_MODEL)
        # Extract the embedding vector from the response
        search_embedding: list[float] = response.data[0].embedding
        results_widget: tw.Markdown = self.query_one(tw.Markdown)

        # Get the top-k most similar liked posts as a DataFrame sorted by cosine similarity
        results = get_top_k_embeddings(df, "embeddings", np.array(search_embedding), k=5)

        # Retrieve the found documents and update the markdown
        docs = [f"❱ {result}" for result in results["full_text"].values]

        results_widget.update("\n\n".join(docs))

compose()

Set up the layout.

Source code in search_x_likes/cosine_search.py
def compose(self) -> ComposeResult:
    """Set up the layout."""
    # Create the Input and TextArea widgets within a Vertical container
    yield Label(f"Search in {df.shape[0]} posts you liked on X.")
    yield Input(
        placeholder="Enter search term...",
    )
    # yield TextArea(id="results")  # Simplified TextArea
    yield tw.Markdown(markdown="Search results will be displayed here...")

on_input_submitted(event)

Handle input submission events (when Enter is pressed).

Source code in search_x_likes/cosine_search.py
@on(Input.Submitted)
def on_input_submitted(self, event: Input.Submitted) -> None:
    """Handle input submission events (when Enter is pressed)."""
    query: str = event.value
    if len(query) < 4:
        return
    query = query.strip()
    response = client.embeddings.create(input=[query], model=EMBEDDING_MODEL)
    # Extract the embedding vector from the response
    search_embedding: list[float] = response.data[0].embedding
    results_widget: tw.Markdown = self.query_one(tw.Markdown)

    # Get the top-k most similar liked posts as a DataFrame sorted by cosine similarity
    results = get_top_k_embeddings(df, "embeddings", np.array(search_embedding), k=5)

    # Retrieve the found documents and update the markdown
    docs = [f"❱ {result}" for result in results["full_text"].values]

    results_widget.update("\n\n".join(docs))
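
Similarly, the module-level client, EMBEDDING_MODEL, and df used above come from cosine_search.py's own setup, which is not shown here. A sketch of what that might look like (the file name, storage format, and column handling are assumptions):

import numpy as np
import openai
import pandas as pd

EMBEDDING_MODEL = "text-embedding-ada-002"
client = openai.OpenAI()  # picks up OPENAI_API_KEY from the environment

# Assumed storage: a Parquet file with 'full_text' and 'embeddings' columns,
# produced by embedding each liked post with get_embedding() above.
df = pd.read_parquet("likes_with_embeddings.parquet")
df["embeddings"] = df["embeddings"].apply(np.array)  # ensure numpy arrays, as get_top_k_embeddings requires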

get_top_k_embeddings(df, embeddings_col, search_embedding, k)

Retrieves the top-k most similar embeddings from a DataFrame.

Parameters:

    df (pd.DataFrame): DataFrame containing the embeddings. Required.
    embeddings_col (str): Column name of the embeddings column. Required.
    search_embedding (np.ndarray): The embedding of the search string. Required.
    k (int): Number of top embeddings to retrieve. Required.

Returns:

    pd.DataFrame: The top-k rows of the DataFrame sorted by cosine similarity to the search embedding, in descending order, with an added 'similarity' column.

Raises:

    EmbeddingColumnTypeError: If the embeddings column does not contain numpy arrays.

Source code in search_x_likes/cosine_search.py
def get_top_k_embeddings(df: pd.DataFrame, embeddings_col: str, search_embedding: np.ndarray, k: int) -> pd.DataFrame:
    """
    Retrieves the top-k most similar embeddings from a DataFrame.

    Parameters:
        df (pd.DataFrame): DataFrame containing the embeddings.
        embeddings_col (str): Column name of embeddings.
        search_embedding (np.ndarray): The embedding of the search string.
        k (int): Number of top embeddings to retrieve.

    Returns:
        pd.DataFrame: The top-k rows sorted by cosine similarity in descending
            order, with an added 'similarity' column.

    Raises:
        EmbeddingColumnTypeError: If the embeddings column does not contain numpy arrays.
    """
    # Ensure the column contains numpy arrays
    if not isinstance(df[embeddings_col].iloc[0], np.ndarray):
        raise EmbeddingColumnTypeError(embeddings_col)

    embeddings = np.vstack(df[embeddings_col].to_list())

    # Compute cosine similarities
    similarities = cosine_similarity(embeddings, search_embedding.reshape(1, -1)).flatten()

    # Add similarities as a new column
    df["similarity"] = similarities

    # Get top-k rows sorted by similarity in descending order
    top_k_df = df.nlargest(k, "similarity")

    return top_k_df
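
A small usage sketch with toy 3-dimensional embeddings. The import path is assumed from the source reference above; note that importing the module may also trigger its own setup (OpenAI client, DataFrame load).

import numpy as np
import pandas as pd

from search_x_likes.cosine_search import get_top_k_embeddings

toy_df = pd.DataFrame(
    {
        "full_text": ["post about cats", "post about linear algebra", "post about dogs"],
        "embeddings": [
            np.array([1.0, 0.0, 0.0]),
            np.array([0.0, 1.0, 0.0]),
            np.array([0.9, 0.1, 0.0]),
        ],
    }
)

top = get_top_k_embeddings(toy_df, "embeddings", np.array([1.0, 0.0, 0.0]), k=2)
print(top[["full_text", "similarity"]])  # cats first, then dogs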