
Modules

load_likes(data_directory)

Load 'like' data from JavaScript files in a given directory.

This function searches for files matching the pattern 'like*.js' within the specified data directory, extracts JSON data from each file, and aggregates the 'like' data into a list.

Parameters:

    data_directory (str): The path to the directory containing the 'like*.js' files. Required.

Returns:

    list[dict[str, LikeInfo]]: A list of dictionaries containing 'like' information.

Notes:

    If an error occurs while processing a file, the exception is caught, an error message is printed, and the function continues with the remaining files.

Source code in search_x_likes/list_likes_in_archive.py
def load_likes(data_directory: str) -> list[dict[str, LikeInfo]]:
    """
    Load 'like' data from JavaScript files in a given directory.

    This function searches for files matching the pattern 'like*.js' within the specified
    data directory, extracts JSON data from each file, and aggregates the 'like' data into a list.

    Args:
        data_directory (str): The path to the directory containing the 'like*.js' files.

    Returns:
        list[dict[str, LikeInfo]]: A list of dictionaries containing 'like' information.

    Note:
        If an error occurs while processing a file, the exception is caught,
        an error message is printed, and the function continues with the
        remaining files.
    """
    likes: list[dict[str, LikeInfo]] = []
    data_path: Path = Path(data_directory)
    like_files = data_path.glob("like*.js")
    for file_path in like_files:
        with file_path.open("r", encoding="utf-8") as f:
            content: str = f.read()
            try:
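                # X archive files are JS assignments ("window.YTD.like.partN = [ ... ]");
                # everything after the first '=' is the JSON payload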
                json_data: str = content[content.index("=") + 1 :].strip()
                like_part: list[dict[str, LikeInfo]] = json.loads(json_data)
                likes.extend(like_part)
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")
    return likes
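
A minimal usage sketch. The import path is taken from the source reference above; the directory path is illustrative, and the 'like'/'fullText' keys are assumed from the search code shown further down this page.

from search_x_likes.list_likes_in_archive import load_likes

likes = load_likes("path/to/x-archive/data")  # the unpacked archive's data/ folder
print(f"Loaded {len(likes)} liked posts")
if likes:
    # Each entry wraps its payload under a 'like' key (see the search apps below)
    print(likes[0]["like"]["fullText"])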

get_embedding(client, text, model='text-embedding-ada-002')

Generate an embedding vector for a given text using the specified OpenAI model.

Parameters:

    client (openai.OpenAI): An OpenAI client object. Required.
    text (str): A string containing the input text for which to generate embeddings. Required.
    model (str): The name of the embedding model to use. Defaults to 'text-embedding-ada-002'.

Returns:

    list[float]: A list of floats representing the embedding vector.

Source code in search_x_likes/embed_posts.py
def get_embedding(client: openai.OpenAI, text: str, model: str = "text-embedding-ada-002") -> list[float]:
    """
    Generate an embedding vector for a given text using the specified OpenAI model.

    Args:
        client (openai.OpenAI): An OpenAI client object.
        text (str): A string containing the input text for which to generate embeddings.
        model (str, optional): The name of the embedding model to use. Defaults to "text-embedding-ada-002".

    Returns:
        list[float]: A list of floats representing the embedding vector.
    """
    cleaned_text = text.replace("\n", " ")
    try:
        response = client.embeddings.create(input=[cleaned_text], model=model)
        # Extract the embedding vector from the response
        embedding: list[float] = response.data[0].embedding
    except Exception as e:
        print(f"An error occurred while generating the embedding: {e}")
        raise
    return embedding
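
A minimal usage sketch, assuming the function is importable from search_x_likes.embed_posts and that OPENAI_API_KEY is set in the environment:

import openai

from search_x_likes.embed_posts import get_embedding

client = openai.OpenAI()  # picks up OPENAI_API_KEY from the environment
vector = get_embedding(client, "A liked post about static typing in Python")
print(len(vector))  # text-embedding-ada-002 produces 1536-dimensional vectors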

InputApp

Bases: App

Source code in search_x_likes/exact_search.py
class InputApp(App):
    CSS = """
    Input {
        margin: 1 1;
    }
    Label {
        margin: 1 2;
    }
    TextArea {
        margin: 1 2;
    }
    """

    def compose(self) -> ComposeResult:
        """Set up the layout."""
        # Create the Input and TextArea widgets within a Vertical container
        yield Label(f"Search in {len(likes)} posts you liked on X.")
        yield Input(
            placeholder="Enter search term...",
        )
        # yield TextArea(id="results")  # Simplified TextArea
        yield tw.Markdown(markdown="Search results will be displayed here...")

    # Explicitly handle the changed event for the input widget
    @on(Input.Changed)
    def on_input_changed(self, event: Input.Changed) -> None:
        """Handle input change events."""
        query: str = event.value.strip()
        results_widget: tw.Markdown = self.query_one(tw.Markdown)
        search: list[str] = []
        number_of_matches: int = 0
        for like_obj in likes:
            like: LikeInfo = like_obj.get("like", {})
            full_text: str = like.get("fullText", "")
            highlight_text: str = highlight_query(full_text, query)
            expanded_url: str = like.get("expandedUrl", "N/A")
            if query in highlight_text:
                search.append(f"❱ [{expanded_url}](expanded_url): " + highlight_text)
                number_of_matches += 1
                if number_of_matches > MAX_NUMBER_OF_MATCHES_SHOWN:
                    break

        results_widget.update("\n\n".join(search))

compose()

Set up the layout.

Source code in search_x_likes/exact_search.py
def compose(self) -> ComposeResult:
    """Set up the layout."""
    # Create the Input and TextArea widgets within a Vertical container
    yield Label(f"Search in {len(likes)} posts you liked on X.")
    yield Input(
        placeholder="Enter search term...",
    )
    # yield TextArea(id="results")  # Simplified TextArea
    yield tw.Markdown(markdown="Search results will be displayed here...")

on_input_changed(event)

Handle input change events.

Source code in search_x_likes/exact_search.py
@on(Input.Changed)
def on_input_changed(self, event: Input.Changed) -> None:
    """Handle input change events."""
    query: str = event.value.strip()
    results_widget: tw.Markdown = self.query_one(tw.Markdown)
    search: list[str] = []
    number_of_matches: int = 0
    for like_obj in likes:
        like: LikeInfo = like_obj.get("like", {})
        full_text: str = like.get("fullText", "")
        highlight_text: str = highlight_query(full_text, query)
        expanded_url: str = like.get("expandedUrl", "N/A")
        if query in highlight_text:
            search.append(f"❱ [{expanded_url}](expanded_url): " + highlight_text)
            number_of_matches += 1
            if number_of_matches > MAX_NUMBER_OF_MATCHES_SHOWN:
                break

    results_widget.update("\n\n".join(search))
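
To try the search TUI, a hypothetical launcher looks like the sketch below; it assumes the module loads the likes archive into the module-level likes list at import time and exposes InputApp as documented above.

from search_x_likes.exact_search import InputApp

if __name__ == "__main__":
    InputApp().run()  # Textual takes over the terminal until the app exits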

highlight_query(text, query)

Wraps every occurrence of the query string in bold Markdown in the text.

Source code in search_x_likes/exact_search.py
def highlight_query(text: str, query: str) -> str:
    """Wraps every occurrence of the query string in bold Markdown in the text."""
    # Escape special regex characters in the query to avoid issues
    query_escaped = re.escape(query)

    # Use re.sub to replace all occurrences of the query with bold Markdown
    highlighted_text = re.sub(f"({query_escaped})", r"**\1**", text, flags=re.IGNORECASE)

    return highlighted_text
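
For example, matching is case-insensitive while the original casing of the text is preserved:

print(highlight_query("Rust and rustaceans", "rust"))
# -> **Rust** and **rust**aceans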

InputApp

Bases: App

Source code in search_x_likes/bm25_search.py
class InputApp(App):
    CSS = """
    Input {
        margin: 1 1;
    }
    Label {
        margin: 1 2;
    }
    TextArea {
        margin: 1 2;
    }
    """

    def compose(self) -> ComposeResult:
        """Set up the layout."""
        # Create the Input and TextArea widgets within a Vertical container
        yield Label(f"Search in {len(likes)} posts you liked on X.")
        yield Input(
            placeholder="Enter search term...",
        )
        # yield TextArea(id="results")  # Simplified TextArea
        yield tw.Markdown(markdown="Search results will be displayed here...")

    # Explicitly handle the changed event for the input widget
    @on(Input.Changed)
    def on_input_changed(self, event: Input.Changed) -> None:
        """Handle input change events."""
        query: str = event.value
        if len(query) < 4:
            return
        query = query.strip()
        query_tokens = bm25s.tokenize(query, stemmer=stemmer)
        results_widget: tw.Markdown = self.query_one(tw.Markdown)
        if len(query_tokens) < 1:  # Do not retrieve when there are no tokens (e.g. word is a stopword)
            results_widget.update("")
            return

        # Get top-k results as a tuple of (documents, scores); passing corpus= returns the documents themselves. Both arrays have shape (n_queries, k)
        results, scores = retriever.retrieve(query_tokens, corpus=corpus, k=MAX_NUMBER_OF_MATCHES_SHOWN)

        # Retrieve the found documents and update the markdown
        docs = [f"❱ {results[0, i]}" for i in range(results.shape[1])]

        results_widget.update("\n\n".join(docs))

compose()

Set up the layout.

Source code in search_x_likes/bm25_search.py
def compose(self) -> ComposeResult:
    """Set up the layout."""
    # Create the Input and TextArea widgets within a Vertical container
    yield Label(f"Search in {len(likes)} posts you liked on X.")
    yield Input(
        placeholder="Enter search term...",
    )
    # yield TextArea(id="results")  # Simplified TextArea
    yield tw.Markdown(markdown="Search results will be displayed here...")

on_input_changed(event)

Handle input change events.

Source code in search_x_likes/bm25_search.py
@on(Input.Changed)
def on_input_changed(self, event: Input.Changed) -> None:
    """Handle input change events."""
    query: str = event.value
    if len(query) < 4:
        return
    query = query.strip()
    query_tokens = bm25s.tokenize(query, stemmer=stemmer)
    results_widget: tw.Markdown = self.query_one(tw.Markdown)
    if len(query_tokens) < 1:  # Do not retrieve when there are no tokens (e.g. word is a stopword)
        results_widget.update("")
        return

    # Get top-k results as a tuple of (documents, scores); passing corpus= returns the documents themselves. Both arrays have shape (n_queries, k)
    results, scores = retriever.retrieve(query_tokens, corpus=corpus, k=MAX_NUMBER_OF_MATCHES_SHOWN)

    # Retrieve the found documents and update the markdown
    docs = [f"❱ {results[0, i]}" for i in range(results.shape[1])]

    results_widget.update("\n\n".join(docs))
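
The module-level stemmer, corpus, and retriever used above are defined elsewhere in bm25_search.py and are not shown on this page. A plausible setup following the standard bm25s indexing pattern (the corpus construction and variable names here are assumptions) is:

import bm25s
import Stemmer  # PyStemmer

from search_x_likes.list_likes_in_archive import load_likes

likes = load_likes("path/to/x-archive/data")
# Index the plain text of every liked post
corpus = [like_obj.get("like", {}).get("fullText", "") for like_obj in likes]

stemmer = Stemmer.Stemmer("english")
corpus_tokens = bm25s.tokenize(corpus, stopwords="en", stemmer=stemmer)

retriever = bm25s.BM25()
retriever.index(corpus_tokens)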

InputApp

Bases: App

Source code in search_x_likes/cosine_search.py
class InputApp(App):
    CSS = """
    Input {
        margin: 1 1;
    }
    Label {
        margin: 1 2;
    }
    TextArea {
        margin: 1 2;
    }
    """

    def compose(self) -> ComposeResult:
        """Set up the layout."""
        # Create the Input and TextArea widgets within a Vertical container
        yield Label(f"Search in {df.shape[0]} posts you liked on X.")
        yield Input(
            placeholder="Enter search term...",
        )
        # yield TextArea(id="results")  # Simplified TextArea
        yield tw.Markdown(markdown="Search results will be displayed here...")

    # Explicitly handle the changed event for the input widget
    # @on(Input.Changed)
    # def on_input_changed(self, event: Input.Changed) -> None:
    @on(Input.Submitted)
    def on_input_submitted(self, event: Input.Submitted) -> None:
        """Handle input submission events (when Enter is pressed)."""
        query: str = event.value
        if len(query) < 4:
            return
        query = query.strip()
        response = client.embeddings.create(input=[query], model=EMBEDDING_MODEL)
        # Extract the embedding vector from the response
        search_embedding: list[float] = response.data[0].embedding
        results_widget: tw.Markdown = self.query_one(tw.Markdown)

        # Get the top-k most similar liked posts as a DataFrame sorted by cosine similarity
        results = get_top_k_embeddings(df, "embeddings", np.array(search_embedding), k=5)

        # Retrieve the found documents and update the markdown
        docs = [f"❱ {result}" for result in results["full_text"].values]

        results_widget.update("\n\n".join(docs))

compose()

Set up the layout.

Source code in search_x_likes/cosine_search.py
def compose(self) -> ComposeResult:
    """Set up the layout."""
    # Create the Input and TextArea widgets within a Vertical container
    yield Label(f"Search in {df.shape[0]} posts you liked on X.")
    yield Input(
        placeholder="Enter search term...",
    )
    # yield TextArea(id="results")  # Simplified TextArea
    yield tw.Markdown(markdown="Search results will be displayed here...")

on_input_submitted(event)

Handle input submission events (when Enter is pressed).

Source code in search_x_likes/cosine_search.py
@on(Input.Submitted)
def on_input_submitted(self, event: Input.Submitted) -> None:
    """Handle input submission events (when Enter is pressed)."""
    query: str = event.value
    if len(query) < 4:
        return
    query = query.strip()
    response = client.embeddings.create(input=[query], model=EMBEDDING_MODEL)
    # Extract the embedding vector from the response
    search_embedding: list[float] = response.data[0].embedding
    results_widget: tw.Markdown = self.query_one(tw.Markdown)

    # Get the top-k most similar liked posts as a DataFrame sorted by cosine similarity
    results = get_top_k_embeddings(df, "embeddings", np.array(search_embedding), k=5)

    # Retrieve the found documents and update the markdown
    docs = [f"❱ {result}" for result in results["full_text"].values]

    results_widget.update("\n\n".join(docs))
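
Similarly, the module-level client, EMBEDDING_MODEL, and df used above come from cosine_search.py's own setup, which is not shown here. A sketch of what that might look like (the file name, storage format, and column handling are assumptions):

import numpy as np
import openai
import pandas as pd

EMBEDDING_MODEL = "text-embedding-ada-002"
client = openai.OpenAI()  # picks up OPENAI_API_KEY from the environment

# Assumed storage: a Parquet file with 'full_text' and 'embeddings' columns,
# produced by embedding each liked post with get_embedding() above.
df = pd.read_parquet("likes_with_embeddings.parquet")
df["embeddings"] = df["embeddings"].apply(np.array)  # ensure numpy arrays, as get_top_k_embeddings requires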

get_top_k_embeddings(df, embeddings_col, search_embedding, k)

Retrieves the top-k most similar embeddings from a DataFrame.

Parameters:

    df (pd.DataFrame): DataFrame containing the embeddings. Required.
    embeddings_col (str): Column name of the embeddings column. Required.
    search_embedding (np.ndarray): The embedding of the search string. Required.
    k (int): Number of top embeddings to retrieve. Required.

Returns:

    pd.DataFrame: The top-k rows of the DataFrame sorted by cosine similarity to the search embedding, in descending order, with an added 'similarity' column.

Raises:

    EmbeddingColumnTypeError: If the embeddings column does not contain numpy arrays.

Source code in search_x_likes/cosine_search.py
def get_top_k_embeddings(df: pd.DataFrame, embeddings_col: str, search_embedding: np.ndarray, k: int) -> pd.DataFrame:
    """
    Retrieves the top-k most similar embeddings from a DataFrame.

    Parameters:
        df (pd.DataFrame): DataFrame containing the embeddings.
        embeddings_col (str): Column name of embeddings.
        search_embedding (np.ndarray): The embedding of the search string.
        k (int): Number of top embeddings to retrieve.

    Returns:
        pd.DataFrame: The top-k rows sorted by cosine similarity in descending
            order, with an added 'similarity' column.

    Raises:
        EmbeddingColumnTypeError: If the embeddings column does not contain numpy arrays.
    """
    # Ensure the column contains numpy arrays
    if not isinstance(df[embeddings_col].iloc[0], np.ndarray):
        raise EmbeddingColumnTypeError(embeddings_col)

    embeddings = np.vstack(df[embeddings_col].to_list())

    # Compute cosine similarities
    similarities = cosine_similarity(embeddings, search_embedding.reshape(1, -1)).flatten()

    # Add similarities as a new column
    df["similarity"] = similarities

    # Get top-k rows sorted by similarity in descending order
    top_k_df = df.nlargest(k, "similarity")

    return top_k_df
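
A small usage sketch with toy 3-dimensional embeddings. The import path is assumed from the source reference above; note that importing the module may also trigger its own setup (OpenAI client, DataFrame load).

import numpy as np
import pandas as pd

from search_x_likes.cosine_search import get_top_k_embeddings

toy_df = pd.DataFrame(
    {
        "full_text": ["post about cats", "post about linear algebra", "post about dogs"],
        "embeddings": [
            np.array([1.0, 0.0, 0.0]),
            np.array([0.0, 1.0, 0.0]),
            np.array([0.9, 0.1, 0.0]),
        ],
    }
)

top = get_top_k_embeddings(toy_df, "embeddings", np.array([1.0, 0.0, 0.0]), k=2)
print(top[["full_text", "similarity"]])  # cats first, then dogs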