From d2fb92670d3ef5c699903fc3acacd231ec51455d Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Wed, 28 May 2025 15:16:07 -0500 Subject: [PATCH 01/23] Updates test snapshot --- r/man/llm_use.Rd | 1 - r/tests/testthat/_snaps/llm-use.md | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/r/man/llm_use.Rd b/r/man/llm_use.Rd index 2ae25fb..fd84f80 100644 --- a/r/man/llm_use.Rd +++ b/r/man/llm_use.Rd @@ -69,6 +69,5 @@ llm_use(.silent = TRUE) library(ellmer) chat <- chat_openai(model = "gpt-4o") llm_use(chat) - } } diff --git a/r/tests/testthat/_snaps/llm-use.md b/r/tests/testthat/_snaps/llm-use.md index b18e5ec..c1c0581 100644 --- a/r/tests/testthat/_snaps/llm-use.md +++ b/r/tests/testthat/_snaps/llm-use.md @@ -26,7 +26,7 @@ -- mall session object Backend: ellmer - LLM session: model:gpt-4o + LLM session: model:gpt-4.1 # Ensures empty llm_use works with Chat @@ -36,5 +36,5 @@ -- mall session object Backend: ellmer - LLM session: model:gpt-4o + LLM session: model:gpt-4.1 From 320700e16e49aeaf0978b2ef2ae6a38d6dd964d3 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Wed, 4 Jun 2025 14:58:12 -0500 Subject: [PATCH 02/23] Moves use function out of polars --- python/mall/llm.py | 22 +++++++++++++++++++++- python/mall/polars.py | 26 ++++++++++---------------- 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/python/mall/llm.py b/python/mall/llm.py index de932a8..3b478a2 100644 --- a/python/mall/llm.py +++ b/python/mall/llm.py @@ -1,9 +1,29 @@ +from ollama import Client +from chatlas import Chat import polars as pl +import hashlib import ollama import json -import hashlib import os +def use_llm(backend="", model="", _cache="_mall_cache", **kwargs): + out = dict() + if isinstance(backend, Chat): + out.update(dict(backend="chatlas")) + out.update(dict(chat=backend)) + backend = "" + model = "" + if isinstance(backend, Client): + out.update(dict(backend="ollama-client")) + out.update(dict(client=backend)) + backend = "" + if backend != "": + out.update(dict(backend=backend)) + if model != "": + out.update(dict(model=model)) + out.update(dict(_cache=_cache)) + out.update(dict(kwargs)) + return out def map_call(df, col, msg, pred_name, use, valid_resps="", convert=None): if valid_resps == "": diff --git a/python/mall/polars.py b/python/mall/polars.py index 6202cc9..f905220 100644 --- a/python/mall/polars.py +++ b/python/mall/polars.py @@ -11,7 +11,10 @@ custom, verify, ) -from mall.llm import map_call +from mall.llm import ( + use_llm, + map_call + ) @pl.api.register_dataframe_namespace("llm") @@ -95,21 +98,12 @@ def use(self, backend="", model="", _cache="_mall_cache", **kwargs): reviews.llm.use(chat) ``` """ - if isinstance(backend, Chat): - self._use.update(dict(backend="chatlas")) - self._use.update(dict(chat=backend)) - backend = "" - model = "" - if isinstance(backend, Client): - self._use.update(dict(backend="ollama-client")) - self._use.update(dict(client=backend)) - backend = "" - if backend != "": - self._use.update(dict(backend=backend)) - if model != "": - self._use.update(dict(model=model)) - self._use.update(dict(_cache=_cache)) - self._use.update(dict(kwargs)) + self._use = use_llm( + backend=backend, + model=model, + _cache=_cache, + **kwargs + ) return self._use def sentiment( From 0eb41af8b2549c1dea379cdd8e2ed9051089004e Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Wed, 4 Jun 2025 15:40:17 -0500 Subject: [PATCH 03/23] 
Standardizes llm functions --- python/mall/llm.py | 4 ++-- python/mall/polars.py | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/python/mall/llm.py b/python/mall/llm.py index 3b478a2..4fc71c8 100644 --- a/python/mall/llm.py +++ b/python/mall/llm.py @@ -6,7 +6,7 @@ import json import os -def use_llm(backend="", model="", _cache="_mall_cache", **kwargs): +def llm_use(backend="", model="", _cache="_mall_cache", **kwargs): out = dict() if isinstance(backend, Chat): out.update(dict(backend="chatlas")) @@ -25,7 +25,7 @@ def llm_use(backend="", model="", _cache="_mall_cache", **kwargs): out.update(dict(kwargs)) return out -def map_call(df, col, msg, pred_name, use, valid_resps="", convert=None): +def llm_map(df, col, msg, pred_name, use, valid_resps="", convert=None): if valid_resps == "": valid_resps = [] valid_resps = valid_output(valid_resps)

diff --git a/python/mall/polars.py b/python/mall/polars.py index f905220..aeded71 100644 --- a/python/mall/polars.py +++ b/python/mall/polars.py @@ -12,8 +12,8 @@ verify, ) from mall.llm import ( - use_llm, - map_call + llm_use, + llm_map ) @@ -98,7 +98,7 @@ def use(self, backend="", model="", _cache="_mall_cache", **kwargs): reviews.llm.use(chat) ``` """ - self._use = use_llm( + self._use = llm_use( backend=backend, model=model, _cache=_cache, **kwargs ) return self._use @@ -155,7 +155,7 @@ def sentiment( ``` """ - df = map_call( + df = llm_map( df=self._df, col=col, msg=sentiment(options, additional=additional), @@ -202,7 +202,7 @@ def summarize( reviews.llm.summarize("review", 5, pred_name = "review_summary") ``` """ - df = map_call( + df = llm_map( df=self._df, col=col, msg=summarize(max_words, additional=additional), @@ -248,7 +248,7 @@ def translate( ``` """ - df = map_call( + df = llm_map( df=self._df, col=col, msg=translate(language, additional=additional), @@ -300,7 +300,7 @@ def classify( reviews.llm.classify("review", {"appliance" : "1", "computer" : "2"}) ``` """ - df = map_call( + df = llm_map( df=self._df, col=col, msg=classify(labels, additional=additional), @@ -384,7 +384,7 @@ def extract( for label in labels: lab_names.append(label) lab_vals.append(labels[label]) - df = map_call( + df = llm_map( df=self._df, col=col, msg=extract(lab_vals, additional=additional), @@ -436,7 +436,7 @@ def custom( reviews.llm.custom("review", prompt = my_prompt) ``` """ - df = map_call( + df = llm_map( df=self._df, col=col, msg=custom(prompt), @@ -489,7 +489,7 @@ def verify( reviews.llm.verify("review", "is the customer happy", ["y", "n"]) ``` """ - df = map_call( + df = llm_map( df=self._df, col=col, msg=verify(what, additional=additional),

From 6a604b2ae2c8453bfeaae4512f43af35aba3d152 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Thu, 5 Jun 2025 09:32:39 -0500 Subject: [PATCH 04/23] Starts building LlmVec --- python/mall/__init__.py | 1 + python/mall/llm.py | 11 +++++++++++ python/mall/llmvec.py | 27 +++++++++++++++++++++++++++ python/mall/polars.py | 12 ++---------- 4 files changed, 41 insertions(+), 10 deletions(-) create mode 100644 python/mall/llmvec.py

diff --git a/python/mall/__init__.py b/python/mall/__init__.py index e623439..84bc232 100644 --- a/python/mall/__init__.py +++ b/python/mall/__init__.py @@ -2,3 +2,4 @@ from mall.polars import MallFrame from mall.data import MallData +from mall.llmvec import LlmVec

diff --git a/python/mall/llm.py b/python/mall/llm.py index 4fc71c8..300edeb 100644 --- a/python/mall/llm.py +++ b/python/mall/llm.py @@ -6,6 +6,7 @@ import json import os + def 
llm_use(backend="", model="", _cache="_mall_cache", **kwargs): out = dict() if isinstance(backend, Chat): @@ -25,6 +26,7 @@ def llm_use(backend="", model="", _cache="_mall_cache", **kwargs): out.update(dict(kwargs)) return out + def llm_map(df, col, msg, pred_name, use, valid_resps="", convert=None): if valid_resps == "": valid_resps = [] @@ -58,6 +60,15 @@ def llm_map(df, col, msg, pred_name, use, valid_resps="", convert=None): return df +def llm_loop(x, msg, use, valid_resps="", convert=None): + out = list() + for row in x: + out.append( + llm_call(x=row, msg=msg, use=use, valid_resps=valid_resps, convert=convert) + ) + return out + + def llm_call(x, msg, use, valid_resps="", convert=None, data_type=None): backend = use.get("backend") diff --git a/python/mall/llmvec.py b/python/mall/llmvec.py new file mode 100644 index 0000000..07b0bac --- /dev/null +++ b/python/mall/llmvec.py @@ -0,0 +1,27 @@ +from mall.prompt import ( + sentiment, + summarize, + translate, + classify, + extract, + custom, + verify, +) + +from mall.llm import llm_use, llm_loop + + +class LlmVec: + def __init__(self, backend="", model="", _cache="_mall_cache", **kwargs): + self._use = llm_use(backend=backend, model=model, _cache=_cache, **kwargs) + + def sentiment( + self, x, options=["positive", "negative", "neutral"], additional="" + ) -> list: + out = llm_loop( + x=x, + msg=sentiment(options, additional=additional), + use=self._use, + valid_resps=options, + ) + return out diff --git a/python/mall/polars.py b/python/mall/polars.py index aeded71..d7ad452 100644 --- a/python/mall/polars.py +++ b/python/mall/polars.py @@ -11,10 +11,7 @@ custom, verify, ) -from mall.llm import ( - llm_use, - llm_map - ) +from mall.llm import llm_use, llm_map @pl.api.register_dataframe_namespace("llm") @@ -98,12 +95,7 @@ def use(self, backend="", model="", _cache="_mall_cache", **kwargs): reviews.llm.use(chat) ``` """ - self._use = llm_use( - backend=backend, - model=model, - _cache=_cache, - **kwargs - ) + self._use = llm_use(backend=backend, model=model, _cache=_cache, **kwargs) return self._use def sentiment( From 3c19e4f364d4df98c6162e770af4ff69bde25513 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Thu, 5 Jun 2025 09:43:56 -0500 Subject: [PATCH 05/23] Adds summarize --- python/mall/llmvec.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/python/mall/llmvec.py b/python/mall/llmvec.py index 07b0bac..34161ed 100644 --- a/python/mall/llmvec.py +++ b/python/mall/llmvec.py @@ -18,10 +18,16 @@ def __init__(self, backend="", model="", _cache="_mall_cache", **kwargs): def sentiment( self, x, options=["positive", "negative", "neutral"], additional="" ) -> list: - out = llm_loop( + return llm_loop( x=x, msg=sentiment(options, additional=additional), use=self._use, valid_resps=options, ) - return out + + def summarize(self, x, max_words=10, additional="") -> list: + return llm_loop( + x=x, + msg=summarize(max_words, additional=additional), + use=self._use, + ) From 35e82f2628f399f727c9df1f221d4a7a1c3a5f7e Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Thu, 5 Jun 2025 13:22:57 -0500 Subject: [PATCH 06/23] Adds translate and classify --- python/mall/llmvec.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/python/mall/llmvec.py b/python/mall/llmvec.py index 34161ed..622c482 100644 --- a/python/mall/llmvec.py +++ b/python/mall/llmvec.py @@ -10,7 +10,6 @@ from mall.llm import llm_use, llm_loop 
- class LlmVec: def __init__(self, backend="", model="", _cache="_mall_cache", **kwargs): self._use = llm_use(backend=backend, model=model, _cache=_cache, **kwargs) @@ -31,3 +30,18 @@ def summarize(self, x, max_words=10, additional="") -> list: msg=summarize(max_words, additional=additional), use=self._use, ) + + def translate(self, x, language="", additional="") -> list: + return llm_loop( + x=x, + msg=translate(language, additional=additional), + use=self._use, + ) + + def classify(self, x, labels="", additional="") -> list: + return llm_loop( + x=x, + msg=classify(labels, additional=additional), + use=self._use, + valid_resps=labels + ) From 9c7968b580d99252f1961c61258441ac58345687 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Thu, 5 Jun 2025 14:27:35 -0500 Subject: [PATCH 07/23] Adds extract, custom and verify --- python/mall/llmvec.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/python/mall/llmvec.py b/python/mall/llmvec.py index 622c482..0224cc3 100644 --- a/python/mall/llmvec.py +++ b/python/mall/llmvec.py @@ -10,6 +10,7 @@ from mall.llm import llm_use, llm_loop + class LlmVec: def __init__(self, backend="", model="", _cache="_mall_cache", **kwargs): self._use = llm_use(backend=backend, model=model, _cache=_cache, **kwargs) @@ -42,6 +43,21 @@ def classify(self, x, labels="", additional="") -> list: return llm_loop( x=x, msg=classify(labels, additional=additional), - use=self._use, - valid_resps=labels + use=self._use, + valid_resps=labels, + ) + + def extract(self, x, labels="", additional="") -> list: + return llm_loop(x=x, msg=extract(labels, additional=additional), use=self._use) + + def custom(self, x, prompt="", valid_resps="") -> list: + return llm_loop(x=x, msg=custom(prompt), use=self._use, valid_resps=labels) + + def verify(self, x, what="", yes_no=[1, 0], additional="") -> list: + return llm_loop( + x=x, + msg=verify(what, additional=additional), + use=self._use, + valid_resps=yes_no, + convert=dict(yes=yes_no[0], no=yes_no[1]), ) From 081af9d247661fef429c0a5996097df60a580868 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Thu, 5 Jun 2025 14:55:47 -0500 Subject: [PATCH 08/23] Starts adding documentation --- python/mall/llm.py | 2 ++ python/mall/llmvec.py | 49 +++++++++++++++++++++++++++++++++++++++++++ python/pyproject.toml | 2 +- 3 files changed, 52 insertions(+), 1 deletion(-) diff --git a/python/mall/llm.py b/python/mall/llm.py index 300edeb..b327f1a 100644 --- a/python/mall/llm.py +++ b/python/mall/llm.py @@ -61,6 +61,8 @@ def llm_map(df, col, msg, pred_name, use, valid_resps="", convert=None): def llm_loop(x, msg, use, valid_resps="", convert=None): + if isinstance(x, list) == False: + raise TypeError("`x` is not a list object") out = list() for row in x: out.append( diff --git a/python/mall/llmvec.py b/python/mall/llmvec.py index 0224cc3..983da3d 100644 --- a/python/mall/llmvec.py +++ b/python/mall/llmvec.py @@ -12,12 +12,43 @@ class LlmVec: + """Class that adds ability to use an LLM to run batch predictions + + ```{python} + from chatlas import ChatOllama + from mall import LlmVec + + chat = ChatOllama(model = "llama3.2") + + llm = LlmVec(chat) + ``` + """ def __init__(self, backend="", model="", _cache="_mall_cache", **kwargs): self._use = llm_use(backend=backend, model=model, _cache=_cache, **kwargs) def sentiment( self, x, options=["positive", "negative", "neutral"], additional="" ) -> list: + """Use an LLM to run a sentiment 
analysis + + Parameters + ------ + x : list + A list of texts + + options : list or dict + A list of the sentiment options to use, or a named DICT + object + + additional : str + Inserts this text into the prompt sent to the LLM + + Examples + ------ + + llm.sentiment(['I am happy', 'I am sad']) + + """ return llm_loop( x=x, msg=sentiment(options, additional=additional), @@ -26,6 +57,24 @@ def sentiment( ) def summarize(self, x, max_words=10, additional="") -> list: + """Summarize the text down to a specific number of words. + + Parameters + ------ + x : list + A list of texts + + max_words : int + Maximum number of words to use for the summary + + additional : str + Inserts this text into the prompt sent to the LLM + + Examples + ------ + + llm.summarize('This has been the best TV I've ever used. Great screen, and sound.', max_words = 5) + """ return llm_loop( x=x, msg=summarize(max_words, additional=additional), diff --git a/python/pyproject.toml b/python/pyproject.toml index 5807592..9a69fb8 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -3,7 +3,7 @@ packages = ["mall"] [project] name = "mlverse-mall" -version = "0.1.0.9001" +version = "0.1.0.9002" description = "Run multiple 'Large Language Model' predictions against a table. The predictions run row-wise over a specified column." readme = "README.md" authors = [ From 5c858ad838e64e31094654c9b0035b204bbf513f Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Thu, 5 Jun 2025 17:05:02 -0500 Subject: [PATCH 09/23] First pass at full documentation --- python/mall/llmvec.py | 101 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 100 insertions(+), 1 deletion(-) diff --git a/python/mall/llmvec.py b/python/mall/llmvec.py index 983da3d..79629d2 100644 --- a/python/mall/llmvec.py +++ b/python/mall/llmvec.py @@ -46,8 +46,9 @@ def sentiment( Examples ------ + ```{python} llm.sentiment(['I am happy', 'I am sad']) - + ``` """ return llm_loop( x=x, @@ -73,7 +74,9 @@ def summarize(self, x, max_words=10, additional="") -> list: Examples ------ + ```{python} llm.summarize('This has been the best TV I've ever used. Great screen, and sound.', max_words = 5) + ``` """ return llm_loop( x=x, @@ -82,6 +85,28 @@ def summarize(self, x, max_words=10, additional="") -> list: ) def translate(self, x, language="", additional="") -> list: + """Translate text into another language. + + Parameters + ------ + x : list + A list of texts + + language : str + The target language to translate to. For example 'French'. + + additional : str + Inserts this text into the prompt sent to the LLM + + + Examples + ------ + + ```{python} + llm.summarize('This has been the best TV I've ever used. Great screen, and sound.', language = 'spanish') + ``` + + """ return llm_loop( x=x, msg=translate(language, additional=additional), @@ -89,6 +114,28 @@ def translate(self, x, language="", additional="") -> list: ) def classify(self, x, labels="", additional="") -> list: + """Classify text into specific categories. + + Parameters + ------ + x : list + A list of texts + + labels : list + A list or a DICT object that defines the categories to + classify the text as. It will return one of the provided + labels. 
+ + additional : str + Inserts this text into the prompt sent to the LLM + + Examples + ------ + + ```{python} + llm.classify(["this is important!", "there is no rush"], ["urgent", "not urgent"]) + ``` + """ return llm_loop( x=x, msg=classify(labels, additional=additional), @@ -97,12 +144,64 @@ def classify(self, x, labels="", additional="") -> list: ) def extract(self, x, labels="", additional="") -> list: + """Pull a specific label from the text. + + Parameters + ------ + x : list + A list of texts + + labels : list + A list or a DICT object that defines tells the LLM what + to look for and return + + additional : str + Inserts this text into the prompt sent to the LLM + + Examples + ------ + + ```{python} + llm.extract(["bob smith, 123 3rd street"], labels=["name", "address"]) + ``` + """ return llm_loop(x=x, msg=extract(labels, additional=additional), use=self._use) def custom(self, x, prompt="", valid_resps="") -> list: + """Provide the full prompt that the LLM will process. + + Parameters + ------ + x : list + A list of texts + + prompt : str + The prompt to send to the LLM along with the `col` + + """ return llm_loop(x=x, msg=custom(prompt), use=self._use, valid_resps=labels) def verify(self, x, what="", yes_no=[1, 0], additional="") -> list: + """Check to see if something is true about the text. + + Parameters + ------ + x : list + A list of texts + + what : str + The statement or question that needs to be verified against the + provided text + + yes_no : list + A positional list of size 2, which contains the values to return + if true and false. The first position will be used as the 'true' + value, and the second as the 'false' value + + additional : str + Inserts this text into the prompt sent to the LLM + + """ return llm_loop( x=x, msg=verify(what, additional=additional), From 5607e6606e193e635aad4f4ed36b0f1a742511a0 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Thu, 5 Jun 2025 17:23:51 -0500 Subject: [PATCH 10/23] Starts updating the site --- .../LlmVec/execute-results/html.json | 12 ++ _quarto.yml | 7 +- objects.json | 2 +- python/mall/llmvec.py | 8 +- reference/LlmVec.qmd | 154 ++++++++++++++++++ reference/MallFrame.qmd | 14 +- reference/_api_index.qmd | 12 +- 7 files changed, 194 insertions(+), 15 deletions(-) create mode 100644 _freeze/reference/LlmVec/execute-results/html.json create mode 100644 reference/LlmVec.qmd diff --git a/_freeze/reference/LlmVec/execute-results/html.json b/_freeze/reference/LlmVec/execute-results/html.json new file mode 100644 index 0000000..b23b7e6 --- /dev/null +++ b/_freeze/reference/LlmVec/execute-results/html.json @@ -0,0 +1,12 @@ +{ + "hash": "68ba5ba7f5ee0162aacd32b63d4f3b5e", + "result": { + "engine": "jupyter", + "markdown": "---\ntitle: LlmVec\n---\n\n\n\n`LlmVec(self, backend='', model='', _cache='_mall_cache', **kwargs)`\n\nClass that adds ability to use an LLM to run batch predictions\n\n\n::: {#59a49358 .cell execution_count=1}\n``` {.python .cell-code}\nfrom chatlas import ChatOllama\nfrom mall import LlmVec\n\nchat = ChatOllama(model = \"llama3.2\")\n\nllm = LlmVec(chat) \n```\n\n::: {.cell-output .cell-output-stderr}\n```\n/Users/edgar/Projects/mall/python/.venv/lib/python3.12/site-packages/pydantic/_internal/_fields.py:132: UserWarning: Field \"model_format\" in ContentToolResult has conflict with protected namespace \"model_\".\n\nYou may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\n warnings.warn(\n```\n:::\n:::\n\n\n## Methods\n\n| 
Name | Description |\n| --- | --- |\n| [classify](#mall.LlmVec.classify) | Classify text into specific categories. |\n| [custom](#mall.LlmVec.custom) | Provide the full prompt that the LLM will process. |\n| [extract](#mall.LlmVec.extract) | Pull a specific label from the text. |\n| [sentiment](#mall.LlmVec.sentiment) | Use an LLM to run a sentiment analysis |\n| [summarize](#mall.LlmVec.summarize) | Summarize the text down to a specific number of words. |\n| [translate](#mall.LlmVec.translate) | Translate text into another language. |\n| [verify](#mall.LlmVec.verify) | Check to see if something is true about the text. |\n\n### classify { #mall.LlmVec.classify }\n\n`LlmVec.classify(x, labels='', additional='')`\n\nClassify text into specific categories.\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|--------------|--------|-------------------------------------------------------------------------------------------------------------------------|------------|\n| `x` | list | A list of texts | _required_ |\n| `labels` | list | A list or a DICT object that defines the categories to classify the text as. It will return one of the provided labels. | `''` |\n| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n#### Examples\n\n::: {#ce63dedd .cell execution_count=2}\n``` {.python .cell-code}\nllm.classify(['this is important!', 'there is no rush'], ['urgent', 'not urgent'])\n```\n\n::: {.cell-output .cell-output-display execution_count=2}\n```\n['urgent', None]\n```\n:::\n:::\n\n\n### custom { #mall.LlmVec.custom }\n\n`LlmVec.custom(x, prompt='', valid_resps='')`\n\nProvide the full prompt that the LLM will process.\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|----------|--------|----------------------------------------------------|------------|\n| `x` | list | A list of texts | _required_ |\n| `prompt` | str | The prompt to send to the LLM along with the `col` | `''` |\n\n### extract { #mall.LlmVec.extract }\n\n`LlmVec.extract(x, labels='', additional='')`\n\nPull a specific label from the text.\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|--------------|--------|--------------------------------------------------------------------------------|------------|\n| `x` | list | A list of texts | _required_ |\n| `labels` | list | A list or a DICT object that defines tells the LLM what to look for and return | `''` |\n| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n#### Examples\n\n::: {#59271e0e .cell execution_count=3}\n``` {.python .cell-code}\nllm.extract(['bob smith, 123 3rd street'], labels=['name', 'address'])\n```\n\n::: {.cell-output .cell-output-display execution_count=3}\n```\n['| bob smith | 123 3rd street |']\n```\n:::\n:::\n\n\n### sentiment { #mall.LlmVec.sentiment }\n\n`LlmVec.sentiment(x, options=['positive', 'negative', 'neutral'], additional='')`\n\nUse an LLM to run a sentiment analysis\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|--------------|--------------|----------------------------------------------------------------|---------------------------------------|\n| `x` | list | A list of texts | _required_ |\n| `options` | list or dict | A list of the sentiment options to use, or a named DICT object | `['positive', 'negative', 'neutral']` |\n| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n#### Examples\n\n::: {#0647089e .cell execution_count=4}\n``` {.python .cell-code}\nllm.sentiment(['I am happy', 'I am 
sad'])\n```\n\n::: {.cell-output .cell-output-display execution_count=4}\n```\n['positive', 'negative']\n```\n:::\n:::\n\n\n### summarize { #mall.LlmVec.summarize }\n\n`LlmVec.summarize(x, max_words=10, additional='')`\n\nSummarize the text down to a specific number of words.\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|--------------|--------|---------------------------------------------------|------------|\n| `x` | list | A list of texts | _required_ |\n| `max_words` | int | Maximum number of words to use for the summary | `10` |\n| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n#### Examples\n\n::: {#bb671e3c .cell execution_count=5}\n``` {.python .cell-code}\nllm.summarize(['This has been the best TV Ive ever used. Great screen, and sound.'], max_words = 5)\n```\n\n::: {.cell-output .cell-output-display execution_count=5}\n```\n['this tv has exceeded expectations']\n```\n:::\n:::\n\n\n### translate { #mall.LlmVec.translate }\n\n`LlmVec.translate(x, language='', additional='')`\n\nTranslate text into another language.\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|--------------|--------|------------------------------------------------------------|------------|\n| `x` | list | A list of texts | _required_ |\n| `language` | str | The target language to translate to. For example 'French'. | `''` |\n| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n#### Examples\n\n::: {#9465c6b2 .cell execution_count=6}\n``` {.python .cell-code}\nllm.translate(['This has been the best TV Ive ever used. Great screen, and sound.'], language = 'spanish')\n```\n\n::: {.cell-output .cell-output-display execution_count=6}\n```\n['Esto ha sido la mejor televisión que he tenido, gran pantalla y sonido.']\n```\n:::\n:::\n\n\n### verify { #mall.LlmVec.verify }\n\n`LlmVec.verify(x, what='', yes_no=[1, 0], additional='')`\n\nCheck to see if something is true about the text.\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|--------------|--------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|\n| `x` | list | A list of texts | _required_ |\n| `what` | str | The statement or question that needs to be verified against the provided text | `''` |\n| `yes_no` | list | A positional list of size 2, which contains the values to return if true and false. 
The first position will be used as the 'true' value, and the second as the 'false' value | `[1, 0]` |\n| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n", + "supporting": [ + "LlmVec_files" + ], + "filters": [], + "includes": {} + } +} \ No newline at end of file diff --git a/_quarto.yml b/_quarto.yml index e592821..c5d7e7a 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -59,10 +59,15 @@ quartodoc: out_index: _api_index.qmd dynamic: true sections: - - title: mall + - title: Polars desc: '' contents: - name: MallFrame + - title: Vectors + desc: '' + contents: + - name: LlmVec + pkgsite: dir: r diff --git a/objects.json b/objects.json index 00ec9f7..f381cae 100644 --- a/objects.json +++ b/objects.json @@ -1 +1 @@ -{"project": "mall", "version": "0.0.9999", "count": 18, "items": [{"name": "mall.MallFrame.classify", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.classify", "dispname": "-"}, {"name": "mall.polars.MallFrame.classify", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.classify", "dispname": "mall.MallFrame.classify"}, {"name": "mall.MallFrame.custom", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.custom", "dispname": "-"}, {"name": "mall.polars.MallFrame.custom", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.custom", "dispname": "mall.MallFrame.custom"}, {"name": "mall.MallFrame.extract", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.extract", "dispname": "-"}, {"name": "mall.polars.MallFrame.extract", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.extract", "dispname": "mall.MallFrame.extract"}, {"name": "mall.MallFrame.sentiment", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.sentiment", "dispname": "-"}, {"name": "mall.polars.MallFrame.sentiment", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.sentiment", "dispname": "mall.MallFrame.sentiment"}, {"name": "mall.MallFrame.summarize", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.summarize", "dispname": "-"}, {"name": "mall.polars.MallFrame.summarize", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.summarize", "dispname": "mall.MallFrame.summarize"}, {"name": "mall.MallFrame.translate", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.translate", "dispname": "-"}, {"name": "mall.polars.MallFrame.translate", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.translate", "dispname": "mall.MallFrame.translate"}, {"name": "mall.MallFrame.use", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.use", "dispname": "-"}, {"name": "mall.polars.MallFrame.use", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.use", "dispname": "mall.MallFrame.use"}, {"name": "mall.MallFrame.verify", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.verify", "dispname": "-"}, {"name": "mall.polars.MallFrame.verify", "domain": "py", "role": 
"function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.verify", "dispname": "mall.MallFrame.verify"}, {"name": "mall.MallFrame", "domain": "py", "role": "class", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame", "dispname": "-"}, {"name": "mall.polars.MallFrame", "domain": "py", "role": "class", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame", "dispname": "mall.MallFrame"}]} \ No newline at end of file +{"project": "mall", "version": "0.0.9999", "count": 34, "items": [{"name": "mall.MallFrame.classify", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.classify", "dispname": "-"}, {"name": "mall.polars.MallFrame.classify", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.classify", "dispname": "mall.MallFrame.classify"}, {"name": "mall.MallFrame.custom", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.custom", "dispname": "-"}, {"name": "mall.polars.MallFrame.custom", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.custom", "dispname": "mall.MallFrame.custom"}, {"name": "mall.MallFrame.extract", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.extract", "dispname": "-"}, {"name": "mall.polars.MallFrame.extract", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.extract", "dispname": "mall.MallFrame.extract"}, {"name": "mall.MallFrame.sentiment", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.sentiment", "dispname": "-"}, {"name": "mall.polars.MallFrame.sentiment", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.sentiment", "dispname": "mall.MallFrame.sentiment"}, {"name": "mall.MallFrame.summarize", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.summarize", "dispname": "-"}, {"name": "mall.polars.MallFrame.summarize", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.summarize", "dispname": "mall.MallFrame.summarize"}, {"name": "mall.MallFrame.translate", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.translate", "dispname": "-"}, {"name": "mall.polars.MallFrame.translate", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.translate", "dispname": "mall.MallFrame.translate"}, {"name": "mall.MallFrame.use", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.use", "dispname": "-"}, {"name": "mall.polars.MallFrame.use", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.use", "dispname": "mall.MallFrame.use"}, {"name": "mall.MallFrame.verify", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.verify", "dispname": "-"}, {"name": "mall.polars.MallFrame.verify", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.verify", "dispname": "mall.MallFrame.verify"}, {"name": "mall.MallFrame", "domain": "py", "role": "class", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame", "dispname": "-"}, {"name": 
"mall.polars.MallFrame", "domain": "py", "role": "class", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame", "dispname": "mall.MallFrame"}, {"name": "mall.LlmVec.classify", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.classify", "dispname": "-"}, {"name": "mall.llmvec.LlmVec.classify", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.classify", "dispname": "mall.LlmVec.classify"}, {"name": "mall.LlmVec.custom", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.custom", "dispname": "-"}, {"name": "mall.llmvec.LlmVec.custom", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.custom", "dispname": "mall.LlmVec.custom"}, {"name": "mall.LlmVec.extract", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.extract", "dispname": "-"}, {"name": "mall.llmvec.LlmVec.extract", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.extract", "dispname": "mall.LlmVec.extract"}, {"name": "mall.LlmVec.sentiment", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.sentiment", "dispname": "-"}, {"name": "mall.llmvec.LlmVec.sentiment", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.sentiment", "dispname": "mall.LlmVec.sentiment"}, {"name": "mall.LlmVec.summarize", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.summarize", "dispname": "-"}, {"name": "mall.llmvec.LlmVec.summarize", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.summarize", "dispname": "mall.LlmVec.summarize"}, {"name": "mall.LlmVec.translate", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.translate", "dispname": "-"}, {"name": "mall.llmvec.LlmVec.translate", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.translate", "dispname": "mall.LlmVec.translate"}, {"name": "mall.LlmVec.verify", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.verify", "dispname": "-"}, {"name": "mall.llmvec.LlmVec.verify", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.verify", "dispname": "mall.LlmVec.verify"}, {"name": "mall.LlmVec", "domain": "py", "role": "class", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec", "dispname": "-"}, {"name": "mall.llmvec.LlmVec", "domain": "py", "role": "class", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec", "dispname": "mall.LlmVec"}]} \ No newline at end of file diff --git a/python/mall/llmvec.py b/python/mall/llmvec.py index 79629d2..8ace983 100644 --- a/python/mall/llmvec.py +++ b/python/mall/llmvec.py @@ -75,7 +75,7 @@ def summarize(self, x, max_words=10, additional="") -> list: ------ ```{python} - llm.summarize('This has been the best TV I've ever used. Great screen, and sound.', max_words = 5) + llm.summarize(['This has been the best TV Ive ever used. Great screen, and sound.'], max_words = 5) ``` """ return llm_loop( @@ -103,7 +103,7 @@ def translate(self, x, language="", additional="") -> list: ------ ```{python} - llm.summarize('This has been the best TV I've ever used. 
Great screen, and sound.', language = 'spanish') + llm.translate(['This has been the best TV Ive ever used. Great screen, and sound.'], language = 'spanish') ``` """ @@ -133,7 +133,7 @@ def classify(self, x, labels="", additional="") -> list: ------ ```{python} - llm.classify(["this is important!", "there is no rush"], ["urgent", "not urgent"]) + llm.classify(['this is important!', 'there is no rush'], ['urgent', 'not urgent']) ``` """ return llm_loop( @@ -162,7 +162,7 @@ def extract(self, x, labels="", additional="") -> list: ------ ```{python} - llm.extract(["bob smith, 123 3rd street"], labels=["name", "address"]) + llm.extract(['bob smith, 123 3rd street'], labels=['name', 'address']) ``` """ return llm_loop(x=x, msg=extract(labels, additional=additional), use=self._use) diff --git a/reference/LlmVec.qmd b/reference/LlmVec.qmd new file mode 100644 index 0000000..b12c38c --- /dev/null +++ b/reference/LlmVec.qmd @@ -0,0 +1,154 @@ +# LlmVec { #mall.LlmVec } + +`LlmVec(self, backend='', model='', _cache='_mall_cache', **kwargs)` + +Class that adds ability to use an LLM to run batch predictions + +```{python} +from chatlas import ChatOllama +from mall import LlmVec + +chat = ChatOllama(model = "llama3.2") + +llm = LlmVec(chat) +``` + +## Methods + +| Name | Description | +| --- | --- | +| [classify](#mall.LlmVec.classify) | Classify text into specific categories. | +| [custom](#mall.LlmVec.custom) | Provide the full prompt that the LLM will process. | +| [extract](#mall.LlmVec.extract) | Pull a specific label from the text. | +| [sentiment](#mall.LlmVec.sentiment) | Use an LLM to run a sentiment analysis | +| [summarize](#mall.LlmVec.summarize) | Summarize the text down to a specific number of words. | +| [translate](#mall.LlmVec.translate) | Translate text into another language. | +| [verify](#mall.LlmVec.verify) | Check to see if something is true about the text. | + +### classify { #mall.LlmVec.classify } + +`LlmVec.classify(x, labels='', additional='')` + +Classify text into specific categories. + +#### Parameters + +| Name | Type | Description | Default | +|--------------|--------|-------------------------------------------------------------------------------------------------------------------------|------------| +| `x` | list | A list of texts | _required_ | +| `labels` | list | A list or a DICT object that defines the categories to classify the text as. It will return one of the provided labels. | `''` | +| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` | + +#### Examples + +```{python} +llm.classify(['this is important!', 'there is no rush'], ['urgent', 'not urgent']) +``` + +### custom { #mall.LlmVec.custom } + +`LlmVec.custom(x, prompt='', valid_resps='')` + +Provide the full prompt that the LLM will process. + +#### Parameters + +| Name | Type | Description | Default | +|----------|--------|----------------------------------------------------|------------| +| `x` | list | A list of texts | _required_ | +| `prompt` | str | The prompt to send to the LLM along with the `col` | `''` | + +### extract { #mall.LlmVec.extract } + +`LlmVec.extract(x, labels='', additional='')` + +Pull a specific label from the text. 
+ +#### Parameters + +| Name | Type | Description | Default | +|--------------|--------|--------------------------------------------------------------------------------|------------| +| `x` | list | A list of texts | _required_ | +| `labels` | list | A list or a DICT object that defines tells the LLM what to look for and return | `''` | +| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` | + +#### Examples + +```{python} +llm.extract(['bob smith, 123 3rd street'], labels=['name', 'address']) +``` + +### sentiment { #mall.LlmVec.sentiment } + +`LlmVec.sentiment(x, options=['positive', 'negative', 'neutral'], additional='')` + +Use an LLM to run a sentiment analysis + +#### Parameters + +| Name | Type | Description | Default | +|--------------|--------------|----------------------------------------------------------------|---------------------------------------| +| `x` | list | A list of texts | _required_ | +| `options` | list or dict | A list of the sentiment options to use, or a named DICT object | `['positive', 'negative', 'neutral']` | +| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` | + +#### Examples + +```{python} +llm.sentiment(['I am happy', 'I am sad']) +``` + +### summarize { #mall.LlmVec.summarize } + +`LlmVec.summarize(x, max_words=10, additional='')` + +Summarize the text down to a specific number of words. + +#### Parameters + +| Name | Type | Description | Default | +|--------------|--------|---------------------------------------------------|------------| +| `x` | list | A list of texts | _required_ | +| `max_words` | int | Maximum number of words to use for the summary | `10` | +| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` | + +#### Examples + +```{python} +llm.summarize(['This has been the best TV Ive ever used. Great screen, and sound.'], max_words = 5) +``` + +### translate { #mall.LlmVec.translate } + +`LlmVec.translate(x, language='', additional='')` + +Translate text into another language. + +#### Parameters + +| Name | Type | Description | Default | +|--------------|--------|------------------------------------------------------------|------------| +| `x` | list | A list of texts | _required_ | +| `language` | str | The target language to translate to. For example 'French'. | `''` | +| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` | + +#### Examples + +```{python} +llm.translate(['This has been the best TV Ive ever used. Great screen, and sound.'], language = 'spanish') +``` + +### verify { #mall.LlmVec.verify } + +`LlmVec.verify(x, what='', yes_no=[1, 0], additional='')` + +Check to see if something is true about the text. + +#### Parameters + +| Name | Type | Description | Default | +|--------------|--------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------| +| `x` | list | A list of texts | _required_ | +| `what` | str | The statement or question that needs to be verified against the provided text | `''` | +| `yes_no` | list | A positional list of size 2, which contains the values to return if true and false. 
The first position will be used as the 'true' value, and the second as the 'false' value | `[1, 0]` | +| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` | \ No newline at end of file

diff --git a/reference/MallFrame.qmd b/reference/MallFrame.qmd index 05031ff..4496d98 100644 --- a/reference/MallFrame.qmd +++ b/reference/MallFrame.qmd @@ -241,12 +241,12 @@ interact with the LLM. #### Parameters -| Name | Type | Description | Default | -|------------|-------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------| -| `backend` | str \| Chat | The name of the backend to use, or a `chatlas` chat object. At the beginning of the session it defaults to "ollama". If passing `""`, it will remain unchanged | `''` | -| `model` | str | The name of the model tha the backend should use. At the beginning of the session it defaults to "llama3.2". If passing `""`, it will remain unchanged | `''` | -| `_cache` | str | The path of where to save the cached results. Passing `""` disables the cache | `'_mall_cache'` | -| `**kwargs` | | Arguments to pass to the downstream Python call. In this case, the `chat` function in `ollama` | `{}` | +| Name | Type | Description | Default | +|------------|-----------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------| +| `backend` | str \| Chat \| Client | The name of the backend to use, or an Ollama Client object, or a `chatlas` Chat object. At the beginning of the session it defaults to "ollama". If passing `""`, it will remain unchanged | `''` | +| `model` | str | The name of the model that the backend should use. At the beginning of the session it defaults to "llama3.2". If passing `""`, it will remain unchanged | `''` | +| `_cache` | str | The path of where to save the cached results. Passing `""` disables the cache | `'_mall_cache'` | +| `**kwargs` | | Arguments to pass to the downstream Python call. 
In this case, the `chat` function in `ollama` | `{}` | #### Examples @@ -274,7 +274,7 @@ reviews.llm.use(_cache = "") ``` ```{python} -# Use a `chatlas` object +# Use a `chatlas` object from chatlas import ChatOpenAI chat = ChatOpenAI() reviews.llm.use(chat) diff --git a/reference/_api_index.qmd b/reference/_api_index.qmd index 093be32..a543276 100644 --- a/reference/_api_index.qmd +++ b/reference/_api_index.qmd @@ -1,9 +1,17 @@ # Function reference {.doc .doc-index} -## mall +## Polars | | | | --- | --- | -| [MallFrame](MallFrame.qmd#mall.MallFrame) | Extension to Polars that add ability to use | \ No newline at end of file +| [MallFrame](MallFrame.qmd#mall.MallFrame) | Extension to Polars that add ability to use | + +## Vectors + + + +| | | +| --- | --- | +| [LlmVec](LlmVec.qmd#mall.LlmVec) | Class that adds ability to use an LLM to run batch predictions | \ No newline at end of file From b87ab6c964b1e43bd93b105c79c658d13ea9fbb0 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Fri, 6 Jun 2025 08:22:38 -0500 Subject: [PATCH 11/23] Finishes reference --- .../LlmVec/execute-results/html.json | 4 ++-- reference/LlmVec.qmd | 5 +++++ reference/index.qmd | 19 ++++++++++++++++++- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/_freeze/reference/LlmVec/execute-results/html.json b/_freeze/reference/LlmVec/execute-results/html.json index b23b7e6..b66eefc 100644 --- a/_freeze/reference/LlmVec/execute-results/html.json +++ b/_freeze/reference/LlmVec/execute-results/html.json @@ -1,8 +1,8 @@ { - "hash": "68ba5ba7f5ee0162aacd32b63d4f3b5e", + "hash": "d14e6fbb48f3ed7d5b6e00d8eff1914e", "result": { "engine": "jupyter", - "markdown": "---\ntitle: LlmVec\n---\n\n\n\n`LlmVec(self, backend='', model='', _cache='_mall_cache', **kwargs)`\n\nClass that adds ability to use an LLM to run batch predictions\n\n\n::: {#59a49358 .cell execution_count=1}\n``` {.python .cell-code}\nfrom chatlas import ChatOllama\nfrom mall import LlmVec\n\nchat = ChatOllama(model = \"llama3.2\")\n\nllm = LlmVec(chat) \n```\n\n::: {.cell-output .cell-output-stderr}\n```\n/Users/edgar/Projects/mall/python/.venv/lib/python3.12/site-packages/pydantic/_internal/_fields.py:132: UserWarning: Field \"model_format\" in ContentToolResult has conflict with protected namespace \"model_\".\n\nYou may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\n warnings.warn(\n```\n:::\n:::\n\n\n## Methods\n\n| Name | Description |\n| --- | --- |\n| [classify](#mall.LlmVec.classify) | Classify text into specific categories. |\n| [custom](#mall.LlmVec.custom) | Provide the full prompt that the LLM will process. |\n| [extract](#mall.LlmVec.extract) | Pull a specific label from the text. |\n| [sentiment](#mall.LlmVec.sentiment) | Use an LLM to run a sentiment analysis |\n| [summarize](#mall.LlmVec.summarize) | Summarize the text down to a specific number of words. |\n| [translate](#mall.LlmVec.translate) | Translate text into another language. |\n| [verify](#mall.LlmVec.verify) | Check to see if something is true about the text. 
|\n\n### classify { #mall.LlmVec.classify }\n\n`LlmVec.classify(x, labels='', additional='')`\n\nClassify text into specific categories.\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|--------------|--------|-------------------------------------------------------------------------------------------------------------------------|------------|\n| `x` | list | A list of texts | _required_ |\n| `labels` | list | A list or a DICT object that defines the categories to classify the text as. It will return one of the provided labels. | `''` |\n| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n#### Examples\n\n::: {#ce63dedd .cell execution_count=2}\n``` {.python .cell-code}\nllm.classify(['this is important!', 'there is no rush'], ['urgent', 'not urgent'])\n```\n\n::: {.cell-output .cell-output-display execution_count=2}\n```\n['urgent', None]\n```\n:::\n:::\n\n\n### custom { #mall.LlmVec.custom }\n\n`LlmVec.custom(x, prompt='', valid_resps='')`\n\nProvide the full prompt that the LLM will process.\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|----------|--------|----------------------------------------------------|------------|\n| `x` | list | A list of texts | _required_ |\n| `prompt` | str | The prompt to send to the LLM along with the `col` | `''` |\n\n### extract { #mall.LlmVec.extract }\n\n`LlmVec.extract(x, labels='', additional='')`\n\nPull a specific label from the text.\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|--------------|--------|--------------------------------------------------------------------------------|------------|\n| `x` | list | A list of texts | _required_ |\n| `labels` | list | A list or a DICT object that defines tells the LLM what to look for and return | `''` |\n| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n#### Examples\n\n::: {#59271e0e .cell execution_count=3}\n``` {.python .cell-code}\nllm.extract(['bob smith, 123 3rd street'], labels=['name', 'address'])\n```\n\n::: {.cell-output .cell-output-display execution_count=3}\n```\n['| bob smith | 123 3rd street |']\n```\n:::\n:::\n\n\n### sentiment { #mall.LlmVec.sentiment }\n\n`LlmVec.sentiment(x, options=['positive', 'negative', 'neutral'], additional='')`\n\nUse an LLM to run a sentiment analysis\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|--------------|--------------|----------------------------------------------------------------|---------------------------------------|\n| `x` | list | A list of texts | _required_ |\n| `options` | list or dict | A list of the sentiment options to use, or a named DICT object | `['positive', 'negative', 'neutral']` |\n| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n#### Examples\n\n::: {#0647089e .cell execution_count=4}\n``` {.python .cell-code}\nllm.sentiment(['I am happy', 'I am sad'])\n```\n\n::: {.cell-output .cell-output-display execution_count=4}\n```\n['positive', 'negative']\n```\n:::\n:::\n\n\n### summarize { #mall.LlmVec.summarize }\n\n`LlmVec.summarize(x, max_words=10, additional='')`\n\nSummarize the text down to a specific number of words.\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|--------------|--------|---------------------------------------------------|------------|\n| `x` | list | A list of texts | _required_ |\n| `max_words` | int | Maximum number of words to use for the summary | `10` |\n| `additional` | str | Inserts this text into the prompt sent to the 
LLM | `''` |\n\n#### Examples\n\n::: {#bb671e3c .cell execution_count=5}\n``` {.python .cell-code}\nllm.summarize(['This has been the best TV Ive ever used. Great screen, and sound.'], max_words = 5)\n```\n\n::: {.cell-output .cell-output-display execution_count=5}\n```\n['this tv has exceeded expectations']\n```\n:::\n:::\n\n\n### translate { #mall.LlmVec.translate }\n\n`LlmVec.translate(x, language='', additional='')`\n\nTranslate text into another language.\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|--------------|--------|------------------------------------------------------------|------------|\n| `x` | list | A list of texts | _required_ |\n| `language` | str | The target language to translate to. For example 'French'. | `''` |\n| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n#### Examples\n\n::: {#9465c6b2 .cell execution_count=6}\n``` {.python .cell-code}\nllm.translate(['This has been the best TV Ive ever used. Great screen, and sound.'], language = 'spanish')\n```\n\n::: {.cell-output .cell-output-display execution_count=6}\n```\n['Esto ha sido la mejor televisión que he tenido, gran pantalla y sonido.']\n```\n:::\n:::\n\n\n### verify { #mall.LlmVec.verify }\n\n`LlmVec.verify(x, what='', yes_no=[1, 0], additional='')`\n\nCheck to see if something is true about the text.\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|--------------|--------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|\n| `x` | list | A list of texts | _required_ |\n| `what` | str | The statement or question that needs to be verified against the provided text | `''` |\n| `yes_no` | list | A positional list of size 2, which contains the values to return if true and false. The first position will be used as the 'true' value, and the second as the 'false' value | `[1, 0]` |\n| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n", + "markdown": "---\ntitle: LlmVec\n---\n\n\n\n`LlmVec(self, backend='', model='', _cache='_mall_cache', **kwargs)`\n\nClass that adds ability to use an LLM to run batch predictions\n\n\n\n::: {#83f06b2c .cell execution_count=2}\n``` {.python .cell-code}\nfrom chatlas import ChatOllama\nfrom mall import LlmVec\n\nchat = ChatOllama(model = \"llama3.2\")\n\nllm = LlmVec(chat) \n```\n:::\n\n\n## Methods\n\n| Name | Description |\n| --- | --- |\n| [classify](#mall.LlmVec.classify) | Classify text into specific categories. |\n| [custom](#mall.LlmVec.custom) | Provide the full prompt that the LLM will process. |\n| [extract](#mall.LlmVec.extract) | Pull a specific label from the text. |\n| [sentiment](#mall.LlmVec.sentiment) | Use an LLM to run a sentiment analysis |\n| [summarize](#mall.LlmVec.summarize) | Summarize the text down to a specific number of words. |\n| [translate](#mall.LlmVec.translate) | Translate text into another language. |\n| [verify](#mall.LlmVec.verify) | Check to see if something is true about the text. 
|\n\n### classify { #mall.LlmVec.classify }\n\n`LlmVec.classify(x, labels='', additional='')`\n\nClassify text into specific categories.\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|--------------|--------|-------------------------------------------------------------------------------------------------------------------------|------------|\n| `x` | list | A list of texts | _required_ |\n| `labels` | list | A list or a DICT object that defines the categories to classify the text as. It will return one of the provided labels. | `''` |\n| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n#### Examples\n\n::: {#730755cf .cell execution_count=3}\n``` {.python .cell-code}\nllm.classify(['this is important!', 'there is no rush'], ['urgent', 'not urgent'])\n```\n\n::: {.cell-output .cell-output-display execution_count=9}\n```\n['urgent', None]\n```\n:::\n:::\n\n\n### custom { #mall.LlmVec.custom }\n\n`LlmVec.custom(x, prompt='', valid_resps='')`\n\nProvide the full prompt that the LLM will process.\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|----------|--------|----------------------------------------------------|------------|\n| `x` | list | A list of texts | _required_ |\n| `prompt` | str | The prompt to send to the LLM along with the `col` | `''` |\n\n### extract { #mall.LlmVec.extract }\n\n`LlmVec.extract(x, labels='', additional='')`\n\nPull a specific label from the text.\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|--------------|--------|--------------------------------------------------------------------------------|------------|\n| `x` | list | A list of texts | _required_ |\n| `labels` | list | A list or a DICT object that defines tells the LLM what to look for and return | `''` |\n| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n#### Examples\n\n::: {#def9cfbf .cell execution_count=4}\n``` {.python .cell-code}\nllm.extract(['bob smith, 123 3rd street'], labels=['name', 'address'])\n```\n\n::: {.cell-output .cell-output-display execution_count=10}\n```\n['| bob smith | 123 3rd street |']\n```\n:::\n:::\n\n\n### sentiment { #mall.LlmVec.sentiment }\n\n`LlmVec.sentiment(x, options=['positive', 'negative', 'neutral'], additional='')`\n\nUse an LLM to run a sentiment analysis\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|--------------|--------------|----------------------------------------------------------------|---------------------------------------|\n| `x` | list | A list of texts | _required_ |\n| `options` | list or dict | A list of the sentiment options to use, or a named DICT object | `['positive', 'negative', 'neutral']` |\n| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n#### Examples\n\n::: {#567dc847 .cell execution_count=5}\n``` {.python .cell-code}\nllm.sentiment(['I am happy', 'I am sad'])\n```\n\n::: {.cell-output .cell-output-display execution_count=11}\n```\n['positive', 'negative']\n```\n:::\n:::\n\n\n### summarize { #mall.LlmVec.summarize }\n\n`LlmVec.summarize(x, max_words=10, additional='')`\n\nSummarize the text down to a specific number of words.\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|--------------|--------|---------------------------------------------------|------------|\n| `x` | list | A list of texts | _required_ |\n| `max_words` | int | Maximum number of words to use for the summary | `10` |\n| `additional` | str | Inserts this text into the prompt sent to 
the LLM | `''` |\n\n#### Examples\n\n::: {#9c92779d .cell execution_count=6}\n``` {.python .cell-code}\nllm.summarize(['This has been the best TV Ive ever used. Great screen, and sound.'], max_words = 5)\n```\n\n::: {.cell-output .cell-output-display execution_count=12}\n```\n['this tv has exceeded expectations']\n```\n:::\n:::\n\n\n### translate { #mall.LlmVec.translate }\n\n`LlmVec.translate(x, language='', additional='')`\n\nTranslate text into another language.\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|--------------|--------|------------------------------------------------------------|------------|\n| `x` | list | A list of texts | _required_ |\n| `language` | str | The target language to translate to. For example 'French'. | `''` |\n| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n#### Examples\n\n::: {#d41bc3b8 .cell execution_count=7}\n``` {.python .cell-code}\nllm.translate(['This has been the best TV Ive ever used. Great screen, and sound.'], language = 'spanish')\n```\n\n::: {.cell-output .cell-output-display execution_count=13}\n```\n['Esto ha sido la mejor televisión que he tenido, gran pantalla y sonido.']\n```\n:::\n:::\n\n\n### verify { #mall.LlmVec.verify }\n\n`LlmVec.verify(x, what='', yes_no=[1, 0], additional='')`\n\nCheck to see if something is true about the text.\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|--------------|--------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|\n| `x` | list | A list of texts | _required_ |\n| `what` | str | The statement or question that needs to be verified against the provided text | `''` |\n| `yes_no` | list | A positional list of size 2, which contains the values to return if true and false. The first position will be used as the 'true' value, and the second as the 'false' value | `[1, 0]` |\n| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n", "supporting": [ "LlmVec_files" ], diff --git a/reference/LlmVec.qmd b/reference/LlmVec.qmd index b12c38c..7f09fe6 100644 --- a/reference/LlmVec.qmd +++ b/reference/LlmVec.qmd @@ -4,6 +4,11 @@ Class that adds ability to use an LLM to run batch predictions +```{python} +#| include: false +import mall +``` + ```{python} from chatlas import ChatOllama from mall import LlmVec diff --git a/reference/index.qmd b/reference/index.qmd index 22633e7..f7bc2be 100644 --- a/reference/index.qmd +++ b/reference/index.qmd @@ -8,7 +8,7 @@ -[MallFrame](MallFrame.qmd#mall.MallFrame) +### MallFrame       Extension to Polars that add ability to use an LLM to run batch predictions over a data frame @@ -24,5 +24,22 @@ an LLM to run batch predictions over a data frame | [use](MallFrame.qmd#mall.MallFrame.use) | Define the model, backend, and other options to use to | | [verify](MallFrame.qmd#mall.MallFrame.verify) | Check to see if something is true about the text. | +

+ +### LlmVec + +       Class that adds ability to use an LLM +to run batch predictions + +| Name | Description | +| --- | --- | +| [classify](LlmVec.qmd#mall.LlmVec.classify) | Classify text into specific categories. | +| [custom](LlmVec.qmd#mall.LlmVec.custom) | Provide the full prompt that the LLM will process. | +| [extract](LlmVec.qmd#mall.LlmVec.extract) | Pull a specific label from the text. | +| [sentiment](LlmVec.qmd#mall.LlmVec.sentiment) | Use an LLM to run a sentiment analysis | +| [summarize](LlmVec.qmd#mall.LlmVec.summarize) | Summarize the text down to a specific number of words. | +| [translate](LlmVec.qmd#mall.LlmVec.translate) | Translate text into another language. | +| [verify](LlmVec.qmd#mall.LlmVec.verify) | Check to see if something is true about the text. | + ::: From 119a5b22f81b9a682d2aaeef32ce8da62fc06a93 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Fri, 6 Jun 2025 13:27:38 -0500 Subject: [PATCH 12/23] Adds vector functions section for Python --- _freeze/index/execute-results/html.json | 4 +-- index.qmd | 38 ++++++++++++++++++++++++- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/_freeze/index/execute-results/html.json b/_freeze/index/execute-results/html.json index 57f6a93..61cfa33 100644 --- a/_freeze/index/execute-results/html.json +++ b/_freeze/index/execute-results/html.json @@ -1,8 +1,8 @@ { - "hash": "fd815bf5a06da5a597e370f770ff1b8e", + "hash": "24845706e96c60bd60659a54c458a8e2", "result": { "engine": "knitr", - "markdown": "---\nformat:\n html:\n toc: true\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n\n\n\n\n[![PyPi](https://img.shields.io/pypi/v/mlverse-mall)](https://pypi.org/project/mlverse-mall/) [![Python tests](https://github.com/mlverse/mall/actions/workflows/python-tests.yaml/badge.svg)](https://github.com/mlverse/mall/actions/workflows/python-tests.yaml) \\| \"CRAN [![R check](https://github.com/mlverse/mall/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/mlverse/mall/actions/workflows/R-CMD-check.yaml) \\| [![Package coverage](https://codecov.io/gh/mlverse/mall/branch/main/graph/badge.svg)](https://app.codecov.io/gh/mlverse/mall?branch=main)\n\n\n\nUse Large Language Models (LLM) to run Natural Language Processing (NLP) \noperations against your data. It takes advantage of the LLMs general language\ntraining in order to get the predictions, thus removing the need to train a new\nNLP model. `mall` is available for R and Python.\n\nIt works by running multiple LLM predictions against your data. The predictions\nare processed row-wise over a specified column. It relies on the \"one-shot\" \nprompt technique to instruct the LLM on a particular NLP operation to perform. 
\nThe package includes prompts to perform the following specific NLP operations:\n\n- [Sentiment analysis](#sentiment)\n- [Text summarizing](#summarize)\n- [Classify text](#classify)\n- [Extract one, or several](#extract), specific pieces information from the text\n- [Translate text](#translate)\n- [Verify that something is true](#verify) about the text (binary)\n\nFor other NLP operations, `mall` offers the ability for you to [write your own prompt](#custom-prompt).\n\n\n\nIn **R** The functions inside `mall` are designed to easily work with piped \ncommands, such as `dplyr`.\n\n``` r\nreviews |>\n llm_sentiment(review)\n```\n\n\n\nIn **Python**, `mall` is a library extension to [Polars](https://pola.rs/).\n\n``` python\nreviews.llm.sentiment(\"review\")\n```\n\n## Motivation\n\nWe want to new find new ways to help data scientists use LLMs in their daily work.\nUnlike the familiar interfaces, such as chatting and code completion, this \ninterface runs your text data directly against the LLM. This package is inspired\nby the SQL AI functions now offered by vendors such as [Databricks](https://docs.databricks.com/en/large-language-models/ai-functions.html) \nand Snowflake. \n\nThe LLM's flexibility, allows for it to adapt to the subject of your data, and\nprovide surprisingly accurate predictions. This saves the data scientist the \nneed to write and tune an NLP model.\n\nIn recent times, the capabilities of LLMs that can run locally in your computer \nhave increased dramatically. This means that these sort of analysis can run in \nyour machine with good accuracy. It also makes it possible to take \nadvantage of LLMs at your institution, since the data will not leave the \ncorporate network. Additionally, LLM management and integration platforms, such\nas [Ollama](https://ollama.com/), are now very easy to setup and use. `mall`\nuses Ollama as to interact with local LLMs.\n\nThe development version of `mall` lets you **use external LLMs such as\n[OpenAI](https://openai.com/), [Gemini](https://gemini.google.com/) and\n[Anthropic](https://www.anthropic.com/)**. In R, `mall` uses the\n[`ellmer`](https://ellmer.tidyverse.org/index.html)\npackage to integrate with the external LLM, and the \n[`chatlas`](https://posit-dev.github.io/chatlas/) package to integrate in Python.\n\n## Install `mall` {#get-started}\n\nInstall the package to get started:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\nOfficial version from CRAN:\n\n``` r\ninstall.packages(\"mall\")\n```\n\nDevelopment version from GitHub *(required for remote LLM integration)*:\n\n``` r\npak::pak(\"mlverse/mall/r\")\n```\n\n## Python\n\nOfficial version from PyPi:\n\n``` python\npip install mlverse-mall\n```\n\nDevelopment version from GitHub:\n\n``` python\npip install \"mlverse-mall @ git+https://git@github.com/mlverse/mall.git#subdirectory=python\"\n```\n:::\n\n## Setup the LLM\n\nChoose one of the two following options to setup LLM connectivity:\n\n### Local LLMs, via Ollama {#local-llms}\n\n- [Download Ollama from the official website](https://ollama.com/download)\n\n- Install and start Ollama in your computer\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n- Install Ollama in your machine. The `ollamar` package's website provides this \n[Installation guide](https://hauselin.github.io/ollama-r/#installation)\n\n- Download an LLM model. For example, I have been developing this package using \nLlama 3.2 to test. 
To get that model you can run:\n\n ``` r\n ollamar::pull(\"llama3.2\")\n ```\n\n## Python\n\n- Install the official Ollama library\n\n ``` python\n pip install ollama\n ```\n\n- Download an LLM model. For example, I have been developing this package\nusing Llama 3.2 to test. To get that model you can run:\n\n ``` python\n import ollama\n ollama.pull('llama3.2')\n ```\n:::\n\n### Remote LLMs {#remote-llms}\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n`mall` uses the `ellmer` package as the integration point to the LLM. This package supports multiple providers such as OpenAI, Anthropic, Google Gemini, etc.\n\n- Install `ellmer`\n\n ``` r\n install.packages(\"ellmer\")\n ```\n\n- Refer to `ellmer`'s documentation to find out how to setup the connections with your selected provider: \n\n- Let `mall` know which `ellmer` object to use during the R session. To do this, call `llm_use()`. Here is an example of using OpenAI:\n\n\n ::: {.cell}\n \n ```{.r .cell-code}\n library(mall)\n library(ellmer)\n chat <- chat_openai()\n #> Using model = \"gpt-4o\".\n llm_use(chat)\n #> \n #> ── mall session object \n #> Backend: ellmerLLM session: model:gpt-4oR session:\n #> cache_folder:/var/folders/y_/f_0cx_291nl0s8h26t4jg6ch0000gp/T//RtmpMrbC3S/_mall_cacheb51979c641c2\n ```\n :::\n\n\n**Set a default LLM for your R session**\n\nAs a convenience, `mall` is able to automatically establish a connection with the\nLLM at the beginning o R session. To do this you can use the `.mall_chat` option:\n\n```r\noptions(.mall_chat = ellmer::chat_openai(model = \"gpt-4o\"))\n```\n\nAdd this line to your *.Rprofile* file in order for that code to run every time\nyou start R. You can call `usethis::edit_r_profile()` to open your .Rprofile\nfile so you can add the option. \n\n## Python\n\n`mall` uses the `chatlas` package as the integration point to the LLM. This \npackage supports multiple providers such as OpenAI, Anthropic, Google Gemini, etc.\n\n- Install the `chatlas` library\n\n ``` python\n pip install chatlas\n ```\n\n- Refer to `chatlas`'s documentation to find out how to setup the connections\nwith your selected provider: \n\n- Let `mall` know which `chatlas` object to use during the Python session. \nTo do this, call `llm_use()`. Here is an example of using OpenAI:\n\n ``` python\n import mall\n from chatlas import ChatOpenAI\n\n chat = ChatOpenAI()\n\n data = mall.MallData\n reviews = data.reviews\n\n reviews.llm.use(chat)\n ```\n:::\n\n## LLM functions\n\nWe will start with loading a very small data set contained in `mall`. It has \n3 product reviews that we will use as the source of our examples.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(mall)\ndata(\"reviews\")\n\nreviews\n#> # A tibble: 3 Ɨ 1\n#> review \n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. \n#> 2 I regret buying this laptop. It is too slow and the keyboard is too noisy \n#> 3 Not sure how to feel about my new washing machine. Great color, but hard to f…\n```\n:::\n\n\n## Python\n\n\n\n\n::: {.cell}\n\n```{.python .cell-code}\nimport mall \ndata = mall.MallData\nreviews = data.reviews\n\nreviews \n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\n
review
"This has been the best TV I've ever used. Great screen, and sound."
"I regret buying this laptop. It is too slow and the keyboard is too noisy"
"Not sure how to feel about my new washing machine. Great color, but hard to figure"
\n```\n\n:::\n:::\n\n:::\n\n\n\n### Sentiment {#sentiment}\n\nAutomatically returns \"positive\", \"negative\", or \"neutral\" based on the text.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_sentiment(review)\n#> # A tibble: 3 Ɨ 2\n#> review .sentiment\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. positive \n#> 2 I regret buying this laptop. It is too slow and the keyboard is to… negative \n#> 3 Not sure how to feel about my new washing machine. Great color, bu… neutral\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_sentiment.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.sentiment(\"review\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\n
reviewsentiment
"This has been the best TV I've ever used. Great screen, and sound.""positive"
"I regret buying this laptop. It is too slow and the keyboard is too noisy""negative"
"Not sure how to feel about my new washing machine. Great color, but hard to figure""neutral"
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.sentiment)\n:::\n\n### Summarize {#summarize}\n\nThere may be a need to reduce the number of words in a given text. Typically to \nmake it easier to understand its intent. The function has an argument to control \nthe maximum number of words to output (`max_words`):\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_summarize(review, max_words = 5)\n#> # A tibble: 3 Ɨ 2\n#> review .summary \n#> \n#> 1 This has been the best TV I've ever used. Gr… it's a great tv \n#> 2 I regret buying this laptop. It is too slow … laptop purchase was a mistake \n#> 3 Not sure how to feel about my new washing ma… having mixed feelings about it\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_summarize.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.summarize(\"review\", 5)\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\n
reviewsummary
"This has been the best TV I've ever used. Great screen, and sound.""great tv with good features"
"I regret buying this laptop. It is too slow and the keyboard is too noisy""laptop purchase was a mistake"
"Not sure how to feel about my new washing machine. Great color, but hard to figure""feeling uncertain about new purchase"
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.summarize)\n:::\n\n### Classify {#classify}\n\nUse the LLM to categorize the text into one of the options you provide:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_classify(review, c(\"appliance\", \"computer\"))\n#> # A tibble: 3 Ɨ 2\n#> review .classify\n#> \n#> 1 This has been the best TV I've ever used. Gr… computer \n#> 2 I regret buying this laptop. It is too slow … computer \n#> 3 Not sure how to feel about my new washing ma… appliance\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_classify.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.classify(\"review\", [\"computer\", \"appliance\"])\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\n
reviewclassify
"This has been the best TV I've ever used. Great screen, and sound.""appliance"
"I regret buying this laptop. It is too slow and the keyboard is too noisy""computer"
"Not sure how to feel about my new washing machine. Great color, but hard to figure""appliance"
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.classify)\n:::\n\n### Extract {#extract}\n\nOne of the most interesting use cases Using natural language, we can tell the \nLLM to return a specific part of the text. In the following example, we request \nthat the LLM return the product being referred to. We do this by simply saying \n\"product\". The LLM understands what we *mean* by that word, and looks for that \nin the text.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_extract(review, \"product\")\n#> # A tibble: 3 Ɨ 2\n#> review .extract \n#> \n#> 1 This has been the best TV I've ever used. Gr… tv \n#> 2 I regret buying this laptop. It is too slow … laptop \n#> 3 Not sure how to feel about my new washing ma… washing machine\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_extract.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.extract(\"review\", \"product\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\n
reviewextract
"This has been the best TV I've ever used. Great screen, and sound.""tv"
"I regret buying this laptop. It is too slow and the keyboard is too noisy""laptop"
"Not sure how to feel about my new washing machine. Great color, but hard to figure""washing machine"
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.extract)\n:::\n\n### Verify {#verify}\n\nThis functions allows you to check and see if a statement is true, based on the\nprovided text. By default, it will return a 1 for \"yes\", and 0 for \"no\". This \ncan be customized.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_verify(review, \"is the customer happy with the purchase\")\n#> # A tibble: 3 Ɨ 2\n#> review .verify\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. 1 \n#> 2 I regret buying this laptop. It is too slow and the keyboard is too n… 0 \n#> 3 Not sure how to feel about my new washing machine. Great color, but h… 0\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_verify.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.verify(\"review\", \"is the customer happy with the purchase\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\n
reviewverify
"This has been the best TV I've ever used. Great screen, and sound."1
"I regret buying this laptop. It is too slow and the keyboard is too noisy"0
"Not sure how to feel about my new washing machine. Great color, but hard to figure"0
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.verify)\n:::\n\n### Translate {#translate}\n\nAs the title implies, this function will translate the text into a specified \nlanguage. What is really nice, it is that you don't need to specify the language\nof the source text. Only the target language needs to be defined. The \ntranslation accuracy will depend on the LLM\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_translate(review, \"spanish\")\n#> # A tibble: 3 Ɨ 2\n#> review .translation \n#> \n#> 1 This has been the best TV I've ever used. Gr… Esta ha sido la mejor televisió…\n#> 2 I regret buying this laptop. It is too slow … Me arrepiento de comprar este p…\n#> 3 Not sure how to feel about my new washing ma… No estoy seguro de cómo me sien…\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_translate.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.translate(\"review\", \"spanish\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\n
reviewtranslation
"This has been the best TV I've ever used. Great screen, and sound.""Esta ha sido la mejor televisión que he utilizado hasta ahora. Gran pantalla y sonido."
"I regret buying this laptop. It is too slow and the keyboard is too noisy""Me arrepiento de comprar este portƔtil. Es demasiado lento y la tecla es demasiado ruidosa."
"Not sure how to feel about my new washing machine. Great color, but hard to figure""No estoy seguro de cómo sentirme con mi nueva lavadora. Un color maravilloso, pero muy difĆ­cil de en…
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.translate)\n:::\n\n### Custom prompt {#custom-prompt}\n\nIt is possible to pass your own prompt to the LLM, and have `mall` run it \nagainst each text entry:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nmy_prompt <- paste(\n \"Answer a question.\",\n \"Return only the answer, no explanation\",\n \"Acceptable answers are 'yes', 'no'\",\n \"Answer this about the following text, is this a happy customer?:\"\n)\n\nreviews |>\n llm_custom(review, my_prompt)\n#> # A tibble: 3 Ɨ 2\n#> review .pred\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. Yes \n#> 2 I regret buying this laptop. It is too slow and the keyboard is too noi… No \n#> 3 Not sure how to feel about my new washing machine. Great color, but har… No\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_custom.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nmy_prompt = (\n \"Answer a question.\"\n \"Return only the answer, no explanation\"\n \"Acceptable answers are 'yes', 'no'\"\n \"Answer this about the following text, is this a happy customer?:\"\n)\n\nreviews.llm.custom(\"review\", prompt = my_prompt)\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\n
reviewcustom
"This has been the best TV I've ever used. Great screen, and sound.""Yes"
"I regret buying this laptop. It is too slow and the keyboard is too noisy""No"
"Not sure how to feel about my new washing machine. Great color, but hard to figure""No"
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.custom)\n:::\n\n## Model selection and settings\n\n#### Local LLMs via Ollama {#settings-local}\n\nYou can set the model and its options to use when calling the LLM. In this case,\nwe refer to options as model specific things that can be set, such as seed or \ntemperature.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\nInvoking an `llm` function will automatically initialize a model selection if \nyou don't have one selected yet. If there is only one option, it will pre-select\nit for you. If there are more than one available models, then `mall` will \npresent you as menu selection so you can select which model you wish to use.\n\nCalling `llm_use()` directly will let you specify the model and backend to use.\nYou can also setup additional arguments that will be passed down to the function\nthat actually runs the prediction. In the case of Ollama, that function is [`chat()`](https://hauselin.github.io/ollama-r/reference/chat.html).\n\nThe model to use, and other options can be set for the current R session\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_use(\"ollama\", \"llama3.2\", seed = 100, temperature = 0)\n```\n:::\n\n\n## Python\n\nThe model and options to be used will be defined at the Polars data frame object \nlevel. If not passed, the default model will be **llama3.2**.\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.use(\"ollama\", \"llama3.2\", options = dict(seed = 100))\n```\n:::\n\n:::\n\n#### Remote LLMs\n\nThe provider and model selection will be based on the chat object you create. \nAny model related setting, such as temperature, seed and others, should be\nset at the time of the object creation as well.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(mall)\nlibrary(ellmer)\nchat <- chat_openai(model = \"gpt-4o\", seed = 100)\nllm_use(chat)\n```\n:::\n\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nimport mall\nfrom chatlas import ChatOpenAI\nchat = ChatOpenAI(model = \"gpt-4o\", seed= 100)\ndata = mall.MallData\nreviews = data.reviews\nreviews.llm.use(chat)\n```\n:::\n\n:::\n\n\n## Results caching\n\nBy default `mall` caches the requests and corresponding results from a given \nLLM run. Each response is saved as individual JSON files. By default, the folder\nname is `_mall_cache`. The folder name can be customized, if needed. Also, the\ncaching can be turned off by setting the argument to empty (`\"\"`).\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_use(.cache = \"_my_cache\")\n```\n:::\n\n\nTo turn off:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_use(.cache = \"\")\n```\n:::\n\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.use(_cache = \"my_cache\")\n```\n:::\n\n\nTo turn off:\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.use(_cache = \"\")\n```\n:::\n\n:::\n\nFor more information see the [Caching Results](articles/caching.qmd) article.\n\n## Key considerations\n\nThe main consideration is **cost**. Either, time cost, or money cost.\n\nIf using this method with an LLM locally available, the cost will be a long \nrunning time. Unless using a very specialized LLM, a given LLM is a general \nmodel. It was fitted using a vast amount of data. So determining a response for\neach row, takes longer than if using a manually created NLP model. 
The default\nmodel used in Ollama is [Llama 3.2](https://ollama.com/library/llama3.2), which \nwas fitted using 3B parameters.\n\nIf using an external LLM service, the consideration will need to be for the \nbilling costs of using such service. Keep in mind that you will be sending a \nlot of data to be evaluated.\n\nAnother consideration is the novelty of this approach. Early tests are providing\nencouraging results. But you, as an user, will still need to keep in mind that \nthe predictions will not be infallible, so always check the output. At this time,\nI think the best use for this method, is for a quick analysis.\n\n## Vector functions (R only)\n\n`mall` includes functions that expect a vector, instead of a table, to run the\npredictions. This should make it easier to test things, such as custom prompts\nor results of specific text. Each `llm_` function has a corresponding `llm_vec_`\nfunction:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_vec_sentiment(\"I am happy\")\n#> [1] \"positive\"\n```\n:::\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_vec_translate(\"Este es el mejor dia!\", \"english\")\n#> [1] \"It's the best day!\"\n```\n:::\n\n", + "markdown": "---\nformat:\n html:\n toc: true\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n\n\n\n\n[![PyPi](https://img.shields.io/pypi/v/mlverse-mall)](https://pypi.org/project/mlverse-mall/) [![Python tests](https://github.com/mlverse/mall/actions/workflows/python-tests.yaml/badge.svg)](https://github.com/mlverse/mall/actions/workflows/python-tests.yaml) \\| \"CRAN [![R check](https://github.com/mlverse/mall/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/mlverse/mall/actions/workflows/R-CMD-check.yaml) \\| [![Package coverage](https://codecov.io/gh/mlverse/mall/branch/main/graph/badge.svg)](https://app.codecov.io/gh/mlverse/mall?branch=main)\n\n\n\nUse Large Language Models (LLM) to run Natural Language Processing (NLP) \noperations against your data. It takes advantage of the LLMs general language\ntraining in order to get the predictions, thus removing the need to train a new\nNLP model. `mall` is available for R and Python.\n\nIt works by running multiple LLM predictions against your data. The predictions\nare processed row-wise over a specified column. It relies on the \"one-shot\" \nprompt technique to instruct the LLM on a particular NLP operation to perform. \nThe package includes prompts to perform the following specific NLP operations:\n\n- [Sentiment analysis](#sentiment)\n- [Text summarizing](#summarize)\n- [Classify text](#classify)\n- [Extract one, or several](#extract), specific pieces information from the text\n- [Translate text](#translate)\n- [Verify that something is true](#verify) about the text (binary)\n\nFor other NLP operations, `mall` offers the ability for you to [write your own prompt](#custom-prompt).\n\n\n\nIn **R** The functions inside `mall` are designed to easily work with piped \ncommands, such as `dplyr`.\n\n``` r\nreviews |>\n llm_sentiment(review)\n```\n\n\n\nIn **Python**, `mall` is a library extension to [Polars](https://pola.rs/).\n\n``` python\nreviews.llm.sentiment(\"review\")\n```\n\n## Motivation\n\nWe want to new find new ways to help data scientists use LLMs in their daily work.\nUnlike the familiar interfaces, such as chatting and code completion, this \ninterface runs your text data directly against the LLM. 
This package is inspired\nby the SQL AI functions now offered by vendors such as [Databricks](https://docs.databricks.com/en/large-language-models/ai-functions.html) \nand Snowflake. \n\nThe LLM's flexibility, allows for it to adapt to the subject of your data, and\nprovide surprisingly accurate predictions. This saves the data scientist the \nneed to write and tune an NLP model.\n\nIn recent times, the capabilities of LLMs that can run locally in your computer \nhave increased dramatically. This means that these sort of analysis can run in \nyour machine with good accuracy. It also makes it possible to take \nadvantage of LLMs at your institution, since the data will not leave the \ncorporate network. Additionally, LLM management and integration platforms, such\nas [Ollama](https://ollama.com/), are now very easy to setup and use. `mall`\nuses Ollama as to interact with local LLMs.\n\nThe development version of `mall` lets you **use external LLMs such as\n[OpenAI](https://openai.com/), [Gemini](https://gemini.google.com/) and\n[Anthropic](https://www.anthropic.com/)**. In R, `mall` uses the\n[`ellmer`](https://ellmer.tidyverse.org/index.html)\npackage to integrate with the external LLM, and the \n[`chatlas`](https://posit-dev.github.io/chatlas/) package to integrate in Python.\n\n## Install `mall` {#get-started}\n\nInstall the package to get started:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\nOfficial version from CRAN:\n\n``` r\ninstall.packages(\"mall\")\n```\n\nDevelopment version from GitHub *(required for remote LLM integration)*:\n\n``` r\npak::pak(\"mlverse/mall/r\")\n```\n\n## Python\n\nOfficial version from PyPi:\n\n``` python\npip install mlverse-mall\n```\n\nDevelopment version from GitHub:\n\n``` python\npip install \"mlverse-mall @ git+https://git@github.com/mlverse/mall.git#subdirectory=python\"\n```\n:::\n\n## Setup the LLM\n\nChoose one of the two following options to setup LLM connectivity:\n\n### Local LLMs, via Ollama {#local-llms}\n\n- [Download Ollama from the official website](https://ollama.com/download)\n\n- Install and start Ollama in your computer\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n- Install Ollama in your machine. The `ollamar` package's website provides this \n[Installation guide](https://hauselin.github.io/ollama-r/#installation)\n\n- Download an LLM model. For example, I have been developing this package using \nLlama 3.2 to test. To get that model you can run:\n\n ``` r\n ollamar::pull(\"llama3.2\")\n ```\n\n## Python\n\n- Install the official Ollama library\n\n ``` python\n pip install ollama\n ```\n\n- Download an LLM model. For example, I have been developing this package\nusing Llama 3.2 to test. To get that model you can run:\n\n ``` python\n import ollama\n ollama.pull('llama3.2')\n ```\n:::\n\n### Remote LLMs {#remote-llms}\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n`mall` uses the `ellmer` package as the integration point to the LLM. This package supports multiple providers such as OpenAI, Anthropic, Google Gemini, etc.\n\n- Install `ellmer`\n\n ``` r\n install.packages(\"ellmer\")\n ```\n\n- Refer to `ellmer`'s documentation to find out how to setup the connections with your selected provider: \n\n- Let `mall` know which `ellmer` object to use during the R session. To do this, call `llm_use()`. 
Here is an example of using OpenAI:\n\n\n ::: {.cell}\n \n ```{.r .cell-code}\n library(mall)\n library(ellmer)\n chat <- chat_openai()\n #> Using model = \"gpt-4.1\".\n llm_use(chat)\n #> \n #> ── mall session object \n #> Backend: ellmerLLM session: model:gpt-4.1R session:\n #> cache_folder:/var/folders/y_/f_0cx_291nl0s8h26t4jg6ch0000gp/T//RtmpsrRw39/_mall_cache8057183e82ae\n ```\n :::\n\n\n**Set a default LLM for your R session**\n\nAs a convenience, `mall` is able to automatically establish a connection with the\nLLM at the beginning o R session. To do this you can use the `.mall_chat` option:\n\n```r\noptions(.mall_chat = ellmer::chat_openai(model = \"gpt-4o\"))\n```\n\nAdd this line to your *.Rprofile* file in order for that code to run every time\nyou start R. You can call `usethis::edit_r_profile()` to open your .Rprofile\nfile so you can add the option. \n\n## Python\n\n`mall` uses the `chatlas` package as the integration point to the LLM. This \npackage supports multiple providers such as OpenAI, Anthropic, Google Gemini, etc.\n\n- Install the `chatlas` library\n\n ``` python\n pip install chatlas\n ```\n\n- Refer to `chatlas`'s documentation to find out how to setup the connections\nwith your selected provider: \n\n- Let `mall` know which `chatlas` object to use during the Python session. \nTo do this, call `llm_use()`. Here is an example of using OpenAI:\n\n ``` python\n import mall\n from chatlas import ChatOpenAI\n\n chat = ChatOpenAI()\n\n data = mall.MallData\n reviews = data.reviews\n\n reviews.llm.use(chat)\n ```\n:::\n\n## LLM functions\n\nWe will start with loading a very small data set contained in `mall`. It has \n3 product reviews that we will use as the source of our examples.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(mall)\ndata(\"reviews\")\n\nreviews\n#> # A tibble: 3 Ɨ 1\n#> review \n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. \n#> 2 I regret buying this laptop. It is too slow and the keyboard is too noisy \n#> 3 Not sure how to feel about my new washing machine. Great color, but hard to f…\n```\n:::\n\n\n## Python\n\n\n\n\n::: {.cell}\n\n```{.python .cell-code}\nimport mall \ndata = mall.MallData\nreviews = data.reviews\n\nreviews \n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\n
review
"This has been the best TV I've ever used. Great screen, and sound."
"I regret buying this laptop. It is too slow and the keyboard is too noisy"
"Not sure how to feel about my new washing machine. Great color, but hard to figure"
\n```\n\n:::\n:::\n\n:::\n\n\n\n### Sentiment {#sentiment}\n\nAutomatically returns \"positive\", \"negative\", or \"neutral\" based on the text.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_sentiment(review)\n#> # A tibble: 3 Ɨ 2\n#> review .sentiment\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. positive \n#> 2 I regret buying this laptop. It is too slow and the keyboard is to… negative \n#> 3 Not sure how to feel about my new washing machine. Great color, bu… neutral\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_sentiment.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.sentiment(\"review\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\n
reviewsentiment
"This has been the best TV I've ever used. Great screen, and sound."null
"I regret buying this laptop. It is too slow and the keyboard is too noisy"null
"Not sure how to feel about my new washing machine. Great color, but hard to figure"null
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.sentiment)\n:::\n\n### Summarize {#summarize}\n\nThere may be a need to reduce the number of words in a given text. Typically to \nmake it easier to understand its intent. The function has an argument to control \nthe maximum number of words to output (`max_words`):\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_summarize(review, max_words = 5)\n#> # A tibble: 3 Ɨ 2\n#> review .summary \n#> \n#> 1 This has been the best TV I've ever used. Gr… great tv with good features \n#> 2 I regret buying this laptop. It is too slow … laptop purchase was a mistake \n#> 3 Not sure how to feel about my new washing ma… having mixed feelings about it\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_summarize.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.summarize(\"review\", 5)\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\n
reviewsummary
"This has been the best TV I've ever used. Great screen, and sound."null
"I regret buying this laptop. It is too slow and the keyboard is too noisy"null
"Not sure how to feel about my new washing machine. Great color, but hard to figure"null
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.summarize)\n:::\n\n### Classify {#classify}\n\nUse the LLM to categorize the text into one of the options you provide:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_classify(review, c(\"appliance\", \"computer\"))\n#> # A tibble: 3 Ɨ 2\n#> review .classify\n#> \n#> 1 This has been the best TV I've ever used. Gr… computer \n#> 2 I regret buying this laptop. It is too slow … computer \n#> 3 Not sure how to feel about my new washing ma… appliance\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_classify.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.classify(\"review\", [\"computer\", \"appliance\"])\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\n
reviewclassify
"This has been the best TV I've ever used. Great screen, and sound."null
"I regret buying this laptop. It is too slow and the keyboard is too noisy"null
"Not sure how to feel about my new washing machine. Great color, but hard to figure"null
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.classify)\n:::\n\n### Extract {#extract}\n\nOne of the most interesting use cases Using natural language, we can tell the \nLLM to return a specific part of the text. In the following example, we request \nthat the LLM return the product being referred to. We do this by simply saying \n\"product\". The LLM understands what we *mean* by that word, and looks for that \nin the text.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_extract(review, \"product\")\n#> # A tibble: 3 Ɨ 2\n#> review .extract \n#> \n#> 1 This has been the best TV I've ever used. Gr… tv \n#> 2 I regret buying this laptop. It is too slow … laptop \n#> 3 Not sure how to feel about my new washing ma… washing machine\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_extract.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.extract(\"review\", \"product\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\n
reviewextract
"This has been the best TV I've ever used. Great screen, and sound."null
"I regret buying this laptop. It is too slow and the keyboard is too noisy"null
"Not sure how to feel about my new washing machine. Great color, but hard to figure"null
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.extract)\n:::\n\n### Verify {#verify}\n\nThis functions allows you to check and see if a statement is true, based on the\nprovided text. By default, it will return a 1 for \"yes\", and 0 for \"no\". This \ncan be customized.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_verify(review, \"is the customer happy with the purchase\")\n#> # A tibble: 3 Ɨ 2\n#> review .verify\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. 1 \n#> 2 I regret buying this laptop. It is too slow and the keyboard is too n… 0 \n#> 3 Not sure how to feel about my new washing machine. Great color, but h… 0\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_verify.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.verify(\"review\", \"is the customer happy with the purchase\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\n
reviewverify
"This has been the best TV I've ever used. Great screen, and sound."null
"I regret buying this laptop. It is too slow and the keyboard is too noisy"null
"Not sure how to feel about my new washing machine. Great color, but hard to figure"null
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.verify)\n:::\n\n### Translate {#translate}\n\nAs the title implies, this function will translate the text into a specified \nlanguage. What is really nice, it is that you don't need to specify the language\nof the source text. Only the target language needs to be defined. The \ntranslation accuracy will depend on the LLM\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_translate(review, \"spanish\")\n#> # A tibble: 3 Ɨ 2\n#> review .translation \n#> \n#> 1 This has been the best TV I've ever used. Gr… Esta ha sido la mejor televisió…\n#> 2 I regret buying this laptop. It is too slow … Me arrepiento de comprar este p…\n#> 3 Not sure how to feel about my new washing ma… No estoy seguro de cómo me sien…\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_translate.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.translate(\"review\", \"spanish\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\n
reviewtranslation
"This has been the best TV I've ever used. Great screen, and sound."null
"I regret buying this laptop. It is too slow and the keyboard is too noisy"null
"Not sure how to feel about my new washing machine. Great color, but hard to figure"null
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.translate)\n:::\n\n### Custom prompt {#custom-prompt}\n\nIt is possible to pass your own prompt to the LLM, and have `mall` run it \nagainst each text entry:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nmy_prompt <- paste(\n \"Answer a question.\",\n \"Return only the answer, no explanation\",\n \"Acceptable answers are 'yes', 'no'\",\n \"Answer this about the following text, is this a happy customer?:\"\n)\n\nreviews |>\n llm_custom(review, my_prompt)\n#> # A tibble: 3 Ɨ 2\n#> review .pred\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. Yes \n#> 2 I regret buying this laptop. It is too slow and the keyboard is too noi… No \n#> 3 Not sure how to feel about my new washing machine. Great color, but har… No\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_custom.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nmy_prompt = (\n \"Answer a question.\"\n \"Return only the answer, no explanation\"\n \"Acceptable answers are 'yes', 'no'\"\n \"Answer this about the following text, is this a happy customer?:\"\n)\n\nreviews.llm.custom(\"review\", prompt = my_prompt)\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\n
reviewcustom
"This has been the best TV I've ever used. Great screen, and sound."null
"I regret buying this laptop. It is too slow and the keyboard is too noisy"null
"Not sure how to feel about my new washing machine. Great color, but hard to figure"null
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.custom)\n:::\n\n## Model selection and settings\n\n#### Local LLMs via Ollama {#settings-local}\n\nYou can set the model and its options to use when calling the LLM. In this case,\nwe refer to options as model specific things that can be set, such as seed or \ntemperature.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\nInvoking an `llm` function will automatically initialize a model selection if \nyou don't have one selected yet. If there is only one option, it will pre-select\nit for you. If there are more than one available models, then `mall` will \npresent you as menu selection so you can select which model you wish to use.\n\nCalling `llm_use()` directly will let you specify the model and backend to use.\nYou can also setup additional arguments that will be passed down to the function\nthat actually runs the prediction. In the case of Ollama, that function is [`chat()`](https://hauselin.github.io/ollama-r/reference/chat.html).\n\nThe model to use, and other options can be set for the current R session\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_use(\"ollama\", \"llama3.2\", seed = 100, temperature = 0)\n```\n:::\n\n\n## Python\n\nThe model and options to be used will be defined at the Polars data frame object \nlevel. If not passed, the default model will be **llama3.2**.\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.use(\"ollama\", \"llama3.2\", options = dict(seed = 100))\n```\n:::\n\n:::\n\n#### Remote LLMs\n\nThe provider and model selection will be based on the chat object you create. \nAny model related setting, such as temperature, seed and others, should be\nset at the time of the object creation as well.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(mall)\nlibrary(ellmer)\nchat <- chat_openai(model = \"gpt-4o\", seed = 100)\nllm_use(chat)\n```\n:::\n\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nimport mall\nfrom chatlas import ChatOpenAI\nchat = ChatOpenAI(model = \"gpt-4o\", seed= 100)\ndata = mall.MallData\nreviews = data.reviews\nreviews.llm.use(chat)\n```\n:::\n\n:::\n\n\n## Results caching\n\nBy default `mall` caches the requests and corresponding results from a given \nLLM run. Each response is saved as individual JSON files. By default, the folder\nname is `_mall_cache`. The folder name can be customized, if needed. Also, the\ncaching can be turned off by setting the argument to empty (`\"\"`).\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_use(.cache = \"_my_cache\")\n```\n:::\n\n\nTo turn off:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_use(.cache = \"\")\n```\n:::\n\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.use(_cache = \"my_cache\")\n```\n:::\n\n\nTo turn off:\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.use(_cache = \"\")\n```\n:::\n\n:::\n\nFor more information see the [Caching Results](articles/caching.qmd) article.\n\n## Key considerations\n\nThe main consideration is **cost**. Either, time cost, or money cost.\n\nIf using this method with an LLM locally available, the cost will be a long \nrunning time. Unless using a very specialized LLM, a given LLM is a general \nmodel. It was fitted using a vast amount of data. So determining a response for\neach row, takes longer than if using a manually created NLP model. 
The default\nmodel used in Ollama is [Llama 3.2](https://ollama.com/library/llama3.2), which \nwas fitted using 3B parameters.\n\nIf using an external LLM service, the consideration will need to be for the \nbilling costs of using such service. Keep in mind that you will be sending a \nlot of data to be evaluated.\n\nAnother consideration is the novelty of this approach. Early tests are providing\nencouraging results. But you, as an user, will still need to keep in mind that \nthe predictions will not be infallible, so always check the output. At this time,\nI think the best use for this method, is for a quick analysis.\n\n## Vector functions\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n`mall` includes functions that expect a vector, instead of a table, to run the\npredictions. This should make it easier to test things, such as custom prompts\nor results of specific text. Each `llm_` function has a corresponding `llm_vec_`\nfunction:\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_vec_sentiment(\"I am happy\")\n#> [1] \"positive\"\n```\n:::\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_vec_translate(\"Este es el mejor dia!\", \"english\")\n#> [1] \"It's the best day!\"\n```\n:::\n\n\n## Python \n\n`mall` is also able to process vectors contained in a `list` object. This allows\nus to avoid having to convert a list of texts without having to first convert\nthem into a single column data frame. To use, initialize a new `LlmVec` class\nobject with either an Ollama model, or a `chatlas` `Chat` object, and then\naccess the same NLP functions as the Polars extension.\n\n\n::: {.cell}\n\n```{.python .cell-code}\n# Initialize a Chat object\nfrom chatlas import ChatOllama\nchat = ChatOllama(model = \"llama3.2\")\n\n# Pass it to a new LlmVec\nfrom mall import LlmVec\nllm = LlmVec(chat) \n```\n:::\n\n\nAccess the functions via the new LlmVec object, and pass the text to be processed.\n\n\n::: {.cell}\n\n```{.python .cell-code}\nllm.sentiment([\"I am happy\", \"I am sad\"])\n#> ['positive', 'negative']\n```\n:::\n\n\n\n::: {.cell}\n\n```{.python .cell-code}\nllm.translate([\"Este es el mejor dia!\"], \"english\")\n#> ['This is the best day!']\n```\n:::\n\n\n\n:::\n\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/index.qmd b/index.qmd index 53737b6..07286ee 100644 --- a/index.qmd +++ b/index.qmd @@ -597,13 +597,17 @@ encouraging results. But you, as an user, will still need to keep in mind that the predictions will not be infallible, so always check the output. At this time, I think the best use for this method, is for a quick analysis. -## Vector functions (R only) +## Vector functions + +::: {.panel-tabset group="language"} +## R `mall` includes functions that expect a vector, instead of a table, to run the predictions. This should make it easier to test things, such as custom prompts or results of specific text. Each `llm_` function has a corresponding `llm_vec_` function: + ```{r} llm_vec_sentiment("I am happy") ``` @@ -611,3 +615,35 @@ llm_vec_sentiment("I am happy") ```{r} llm_vec_translate("Este es el mejor dia!", "english") ``` + +## Python + +`mall` is also able to process vectors contained in a `list` object. This allows +us to avoid having to convert a list of texts without having to first convert +them into a single column data frame. To use, initialize a new `LlmVec` class +object with either an Ollama model, or a `chatlas` `Chat` object, and then +access the same NLP functions as the Polars extension. 
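+
+As a sketch of the Ollama route (not rendered here, and assuming the
+`LlmVec(self, backend='', model='', _cache='_mall_cache', **kwargs)` signature
+shown on its reference page), the constructor should also accept the backend
+and model names directly:
+
+```python
+# Hypothetical direct-construction sketch; the rendered example below
+# uses a chatlas Chat object instead.
+from mall import LlmVec
+llm_ollama = LlmVec("ollama", "llama3.2")
+```
+
+The rendered example below takes the `chatlas` route.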
+ +```{python} +# Initialize a Chat object +from chatlas import ChatOllama +chat = ChatOllama(model = "llama3.2") + +# Pass it to a new LlmVec +from mall import LlmVec +llm = LlmVec(chat) +``` + +Access the functions via the new LlmVec object, and pass the text to be processed. + +```{python} +llm.sentiment(["I am happy", "I am sad"]) +``` + +```{python} +llm.translate(["Este es el mejor dia!"], "english") +``` + + +::: + From 1a95c13431fc48a5099042662f7ea61d85de322b Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Fri, 6 Jun 2025 16:48:50 -0500 Subject: [PATCH 13/23] Renames to LLMVec, updates docs and index --- _freeze/index/execute-results/html.json | 4 +- .../LlmVec/execute-results/html.json | 6 +- _quarto.yml | 2 +- index.qmd | 17 +- objects.json | 2 +- python/mall/__init__.py | 4 +- python/mall/llm.py | 2 +- python/mall/llmvec.py | 6 +- python/pyproject.toml | 2 +- reference/LlmVec.qmd | 169 +++++++++-------- reference/MallFrame.qmd | 175 ++++++++++-------- reference/_api_index.qmd | 2 +- reference/index.qmd | 16 +- 13 files changed, 222 insertions(+), 185 deletions(-) diff --git a/_freeze/index/execute-results/html.json b/_freeze/index/execute-results/html.json index 61cfa33..a847328 100644 --- a/_freeze/index/execute-results/html.json +++ b/_freeze/index/execute-results/html.json @@ -1,8 +1,8 @@ { - "hash": "24845706e96c60bd60659a54c458a8e2", + "hash": "caaf0a8a4e7d66d2670b9f92dba16bf0", "result": { "engine": "knitr", - "markdown": "---\nformat:\n html:\n toc: true\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n\n\n\n\n[![PyPi](https://img.shields.io/pypi/v/mlverse-mall)](https://pypi.org/project/mlverse-mall/) [![Python tests](https://github.com/mlverse/mall/actions/workflows/python-tests.yaml/badge.svg)](https://github.com/mlverse/mall/actions/workflows/python-tests.yaml) \\| \"CRAN [![R check](https://github.com/mlverse/mall/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/mlverse/mall/actions/workflows/R-CMD-check.yaml) \\| [![Package coverage](https://codecov.io/gh/mlverse/mall/branch/main/graph/badge.svg)](https://app.codecov.io/gh/mlverse/mall?branch=main)\n\n\n\nUse Large Language Models (LLM) to run Natural Language Processing (NLP) \noperations against your data. It takes advantage of the LLMs general language\ntraining in order to get the predictions, thus removing the need to train a new\nNLP model. `mall` is available for R and Python.\n\nIt works by running multiple LLM predictions against your data. The predictions\nare processed row-wise over a specified column. It relies on the \"one-shot\" \nprompt technique to instruct the LLM on a particular NLP operation to perform. 
\nThe package includes prompts to perform the following specific NLP operations:\n\n- [Sentiment analysis](#sentiment)\n- [Text summarizing](#summarize)\n- [Classify text](#classify)\n- [Extract one, or several](#extract), specific pieces information from the text\n- [Translate text](#translate)\n- [Verify that something is true](#verify) about the text (binary)\n\nFor other NLP operations, `mall` offers the ability for you to [write your own prompt](#custom-prompt).\n\n\n\nIn **R** The functions inside `mall` are designed to easily work with piped \ncommands, such as `dplyr`.\n\n``` r\nreviews |>\n llm_sentiment(review)\n```\n\n\n\nIn **Python**, `mall` is a library extension to [Polars](https://pola.rs/).\n\n``` python\nreviews.llm.sentiment(\"review\")\n```\n\n## Motivation\n\nWe want to new find new ways to help data scientists use LLMs in their daily work.\nUnlike the familiar interfaces, such as chatting and code completion, this \ninterface runs your text data directly against the LLM. This package is inspired\nby the SQL AI functions now offered by vendors such as [Databricks](https://docs.databricks.com/en/large-language-models/ai-functions.html) \nand Snowflake. \n\nThe LLM's flexibility, allows for it to adapt to the subject of your data, and\nprovide surprisingly accurate predictions. This saves the data scientist the \nneed to write and tune an NLP model.\n\nIn recent times, the capabilities of LLMs that can run locally in your computer \nhave increased dramatically. This means that these sort of analysis can run in \nyour machine with good accuracy. It also makes it possible to take \nadvantage of LLMs at your institution, since the data will not leave the \ncorporate network. Additionally, LLM management and integration platforms, such\nas [Ollama](https://ollama.com/), are now very easy to setup and use. `mall`\nuses Ollama as to interact with local LLMs.\n\nThe development version of `mall` lets you **use external LLMs such as\n[OpenAI](https://openai.com/), [Gemini](https://gemini.google.com/) and\n[Anthropic](https://www.anthropic.com/)**. In R, `mall` uses the\n[`ellmer`](https://ellmer.tidyverse.org/index.html)\npackage to integrate with the external LLM, and the \n[`chatlas`](https://posit-dev.github.io/chatlas/) package to integrate in Python.\n\n## Install `mall` {#get-started}\n\nInstall the package to get started:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\nOfficial version from CRAN:\n\n``` r\ninstall.packages(\"mall\")\n```\n\nDevelopment version from GitHub *(required for remote LLM integration)*:\n\n``` r\npak::pak(\"mlverse/mall/r\")\n```\n\n## Python\n\nOfficial version from PyPi:\n\n``` python\npip install mlverse-mall\n```\n\nDevelopment version from GitHub:\n\n``` python\npip install \"mlverse-mall @ git+https://git@github.com/mlverse/mall.git#subdirectory=python\"\n```\n:::\n\n## Setup the LLM\n\nChoose one of the two following options to setup LLM connectivity:\n\n### Local LLMs, via Ollama {#local-llms}\n\n- [Download Ollama from the official website](https://ollama.com/download)\n\n- Install and start Ollama in your computer\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n- Install Ollama in your machine. The `ollamar` package's website provides this \n[Installation guide](https://hauselin.github.io/ollama-r/#installation)\n\n- Download an LLM model. For example, I have been developing this package using \nLlama 3.2 to test. 
To get that model you can run:\n\n ``` r\n ollamar::pull(\"llama3.2\")\n ```\n\n## Python\n\n- Install the official Ollama library\n\n ``` python\n pip install ollama\n ```\n\n- Download an LLM model. For example, I have been developing this package\nusing Llama 3.2 to test. To get that model you can run:\n\n ``` python\n import ollama\n ollama.pull('llama3.2')\n ```\n:::\n\n### Remote LLMs {#remote-llms}\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n`mall` uses the `ellmer` package as the integration point to the LLM. This package supports multiple providers such as OpenAI, Anthropic, Google Gemini, etc.\n\n- Install `ellmer`\n\n ``` r\n install.packages(\"ellmer\")\n ```\n\n- Refer to `ellmer`'s documentation to find out how to setup the connections with your selected provider: \n\n- Let `mall` know which `ellmer` object to use during the R session. To do this, call `llm_use()`. Here is an example of using OpenAI:\n\n\n ::: {.cell}\n \n ```{.r .cell-code}\n library(mall)\n library(ellmer)\n chat <- chat_openai()\n #> Using model = \"gpt-4.1\".\n llm_use(chat)\n #> \n #> ── mall session object \n #> Backend: ellmerLLM session: model:gpt-4.1R session:\n #> cache_folder:/var/folders/y_/f_0cx_291nl0s8h26t4jg6ch0000gp/T//RtmpsrRw39/_mall_cache8057183e82ae\n ```\n :::\n\n\n**Set a default LLM for your R session**\n\nAs a convenience, `mall` is able to automatically establish a connection with the\nLLM at the beginning o R session. To do this you can use the `.mall_chat` option:\n\n```r\noptions(.mall_chat = ellmer::chat_openai(model = \"gpt-4o\"))\n```\n\nAdd this line to your *.Rprofile* file in order for that code to run every time\nyou start R. You can call `usethis::edit_r_profile()` to open your .Rprofile\nfile so you can add the option. \n\n## Python\n\n`mall` uses the `chatlas` package as the integration point to the LLM. This \npackage supports multiple providers such as OpenAI, Anthropic, Google Gemini, etc.\n\n- Install the `chatlas` library\n\n ``` python\n pip install chatlas\n ```\n\n- Refer to `chatlas`'s documentation to find out how to setup the connections\nwith your selected provider: \n\n- Let `mall` know which `chatlas` object to use during the Python session. \nTo do this, call `llm_use()`. Here is an example of using OpenAI:\n\n ``` python\n import mall\n from chatlas import ChatOpenAI\n\n chat = ChatOpenAI()\n\n data = mall.MallData\n reviews = data.reviews\n\n reviews.llm.use(chat)\n ```\n:::\n\n## LLM functions\n\nWe will start with loading a very small data set contained in `mall`. It has \n3 product reviews that we will use as the source of our examples.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(mall)\ndata(\"reviews\")\n\nreviews\n#> # A tibble: 3 Ɨ 1\n#> review \n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. \n#> 2 I regret buying this laptop. It is too slow and the keyboard is too noisy \n#> 3 Not sure how to feel about my new washing machine. Great color, but hard to f…\n```\n:::\n\n\n## Python\n\n\n\n\n::: {.cell}\n\n```{.python .cell-code}\nimport mall \ndata = mall.MallData\nreviews = data.reviews\n\nreviews \n```\n\n::: {.cell-output-display}\n\n```{=html}\n
| review |
| --- |
| "This has been the best TV I've ever used. Great screen, and sound." |
| "I regret buying this laptop. It is too slow and the keyboard is too noisy" |
| "Not sure how to feel about my new washing machine. Great color, but hard to figure" |
\n```\n\n:::\n:::\n\n:::\n\n\n\n### Sentiment {#sentiment}\n\nAutomatically returns \"positive\", \"negative\", or \"neutral\" based on the text.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_sentiment(review)\n#> # A tibble: 3 Ɨ 2\n#> review .sentiment\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. positive \n#> 2 I regret buying this laptop. It is too slow and the keyboard is to… negative \n#> 3 Not sure how to feel about my new washing machine. Great color, bu… neutral\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_sentiment.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.sentiment(\"review\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
| review | sentiment |
| --- | --- |
| "This has been the best TV I've ever used. Great screen, and sound." | null |
| "I regret buying this laptop. It is too slow and the keyboard is too noisy" | null |
| "Not sure how to feel about my new washing machine. Great color, but hard to figure" | null |
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.sentiment)\n:::\n\n### Summarize {#summarize}\n\nThere may be a need to reduce the number of words in a given text. Typically to \nmake it easier to understand its intent. The function has an argument to control \nthe maximum number of words to output (`max_words`):\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_summarize(review, max_words = 5)\n#> # A tibble: 3 Ɨ 2\n#> review .summary \n#> \n#> 1 This has been the best TV I've ever used. Gr… great tv with good features \n#> 2 I regret buying this laptop. It is too slow … laptop purchase was a mistake \n#> 3 Not sure how to feel about my new washing ma… having mixed feelings about it\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_summarize.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.summarize(\"review\", 5)\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
| review | summary |
| --- | --- |
| "This has been the best TV I've ever used. Great screen, and sound." | null |
| "I regret buying this laptop. It is too slow and the keyboard is too noisy" | null |
| "Not sure how to feel about my new washing machine. Great color, but hard to figure" | null |
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.summarize)\n:::\n\n### Classify {#classify}\n\nUse the LLM to categorize the text into one of the options you provide:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_classify(review, c(\"appliance\", \"computer\"))\n#> # A tibble: 3 Ɨ 2\n#> review .classify\n#> \n#> 1 This has been the best TV I've ever used. Gr… computer \n#> 2 I regret buying this laptop. It is too slow … computer \n#> 3 Not sure how to feel about my new washing ma… appliance\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_classify.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.classify(\"review\", [\"computer\", \"appliance\"])\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
| review | classify |
| --- | --- |
| "This has been the best TV I've ever used. Great screen, and sound." | null |
| "I regret buying this laptop. It is too slow and the keyboard is too noisy" | null |
| "Not sure how to feel about my new washing machine. Great color, but hard to figure" | null |
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.classify)\n:::\n\n### Extract {#extract}\n\nOne of the most interesting use cases Using natural language, we can tell the \nLLM to return a specific part of the text. In the following example, we request \nthat the LLM return the product being referred to. We do this by simply saying \n\"product\". The LLM understands what we *mean* by that word, and looks for that \nin the text.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_extract(review, \"product\")\n#> # A tibble: 3 Ɨ 2\n#> review .extract \n#> \n#> 1 This has been the best TV I've ever used. Gr… tv \n#> 2 I regret buying this laptop. It is too slow … laptop \n#> 3 Not sure how to feel about my new washing ma… washing machine\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_extract.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.extract(\"review\", \"product\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
| review | extract |
| --- | --- |
| "This has been the best TV I've ever used. Great screen, and sound." | null |
| "I regret buying this laptop. It is too slow and the keyboard is too noisy" | null |
| "Not sure how to feel about my new washing machine. Great color, but hard to figure" | null |
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.extract)\n:::\n\n### Verify {#verify}\n\nThis functions allows you to check and see if a statement is true, based on the\nprovided text. By default, it will return a 1 for \"yes\", and 0 for \"no\". This \ncan be customized.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_verify(review, \"is the customer happy with the purchase\")\n#> # A tibble: 3 Ɨ 2\n#> review .verify\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. 1 \n#> 2 I regret buying this laptop. It is too slow and the keyboard is too n… 0 \n#> 3 Not sure how to feel about my new washing machine. Great color, but h… 0\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_verify.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.verify(\"review\", \"is the customer happy with the purchase\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
| review | verify |
| --- | --- |
| "This has been the best TV I've ever used. Great screen, and sound." | null |
| "I regret buying this laptop. It is too slow and the keyboard is too noisy" | null |
| "Not sure how to feel about my new washing machine. Great color, but hard to figure" | null |
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.verify)\n:::\n\n### Translate {#translate}\n\nAs the title implies, this function will translate the text into a specified \nlanguage. What is really nice, it is that you don't need to specify the language\nof the source text. Only the target language needs to be defined. The \ntranslation accuracy will depend on the LLM\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_translate(review, \"spanish\")\n#> # A tibble: 3 Ɨ 2\n#> review .translation \n#> \n#> 1 This has been the best TV I've ever used. Gr… Esta ha sido la mejor televisió…\n#> 2 I regret buying this laptop. It is too slow … Me arrepiento de comprar este p…\n#> 3 Not sure how to feel about my new washing ma… No estoy seguro de cómo me sien…\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_translate.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.translate(\"review\", \"spanish\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
| review | translation |
| --- | --- |
| "This has been the best TV I've ever used. Great screen, and sound." | null |
| "I regret buying this laptop. It is too slow and the keyboard is too noisy" | null |
| "Not sure how to feel about my new washing machine. Great color, but hard to figure" | null |
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.translate)\n:::\n\n### Custom prompt {#custom-prompt}\n\nIt is possible to pass your own prompt to the LLM, and have `mall` run it \nagainst each text entry:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nmy_prompt <- paste(\n \"Answer a question.\",\n \"Return only the answer, no explanation\",\n \"Acceptable answers are 'yes', 'no'\",\n \"Answer this about the following text, is this a happy customer?:\"\n)\n\nreviews |>\n llm_custom(review, my_prompt)\n#> # A tibble: 3 Ɨ 2\n#> review .pred\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. Yes \n#> 2 I regret buying this laptop. It is too slow and the keyboard is too noi… No \n#> 3 Not sure how to feel about my new washing machine. Great color, but har… No\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_custom.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nmy_prompt = (\n \"Answer a question.\"\n \"Return only the answer, no explanation\"\n \"Acceptable answers are 'yes', 'no'\"\n \"Answer this about the following text, is this a happy customer?:\"\n)\n\nreviews.llm.custom(\"review\", prompt = my_prompt)\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
| review | custom |
| --- | --- |
| "This has been the best TV I've ever used. Great screen, and sound." | null |
| "I regret buying this laptop. It is too slow and the keyboard is too noisy" | null |
| "Not sure how to feel about my new washing machine. Great color, but hard to figure" | null |
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.custom)\n:::\n\n## Model selection and settings\n\n#### Local LLMs via Ollama {#settings-local}\n\nYou can set the model and its options to use when calling the LLM. In this case,\nwe refer to options as model specific things that can be set, such as seed or \ntemperature.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\nInvoking an `llm` function will automatically initialize a model selection if \nyou don't have one selected yet. If there is only one option, it will pre-select\nit for you. If there are more than one available models, then `mall` will \npresent you as menu selection so you can select which model you wish to use.\n\nCalling `llm_use()` directly will let you specify the model and backend to use.\nYou can also setup additional arguments that will be passed down to the function\nthat actually runs the prediction. In the case of Ollama, that function is [`chat()`](https://hauselin.github.io/ollama-r/reference/chat.html).\n\nThe model to use, and other options can be set for the current R session\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_use(\"ollama\", \"llama3.2\", seed = 100, temperature = 0)\n```\n:::\n\n\n## Python\n\nThe model and options to be used will be defined at the Polars data frame object \nlevel. If not passed, the default model will be **llama3.2**.\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.use(\"ollama\", \"llama3.2\", options = dict(seed = 100))\n```\n:::\n\n:::\n\n#### Remote LLMs\n\nThe provider and model selection will be based on the chat object you create. \nAny model related setting, such as temperature, seed and others, should be\nset at the time of the object creation as well.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(mall)\nlibrary(ellmer)\nchat <- chat_openai(model = \"gpt-4o\", seed = 100)\nllm_use(chat)\n```\n:::\n\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nimport mall\nfrom chatlas import ChatOpenAI\nchat = ChatOpenAI(model = \"gpt-4o\", seed= 100)\ndata = mall.MallData\nreviews = data.reviews\nreviews.llm.use(chat)\n```\n:::\n\n:::\n\n\n## Results caching\n\nBy default `mall` caches the requests and corresponding results from a given \nLLM run. Each response is saved as individual JSON files. By default, the folder\nname is `_mall_cache`. The folder name can be customized, if needed. Also, the\ncaching can be turned off by setting the argument to empty (`\"\"`).\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_use(.cache = \"_my_cache\")\n```\n:::\n\n\nTo turn off:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_use(.cache = \"\")\n```\n:::\n\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.use(_cache = \"my_cache\")\n```\n:::\n\n\nTo turn off:\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.use(_cache = \"\")\n```\n:::\n\n:::\n\nFor more information see the [Caching Results](articles/caching.qmd) article.\n\n## Key considerations\n\nThe main consideration is **cost**. Either, time cost, or money cost.\n\nIf using this method with an LLM locally available, the cost will be a long \nrunning time. Unless using a very specialized LLM, a given LLM is a general \nmodel. It was fitted using a vast amount of data. So determining a response for\neach row, takes longer than if using a manually created NLP model. 
The default\nmodel used in Ollama is [Llama 3.2](https://ollama.com/library/llama3.2), which \nwas fitted using 3B parameters.\n\nIf using an external LLM service, the consideration will need to be for the \nbilling costs of using such service. Keep in mind that you will be sending a \nlot of data to be evaluated.\n\nAnother consideration is the novelty of this approach. Early tests are providing\nencouraging results. But you, as an user, will still need to keep in mind that \nthe predictions will not be infallible, so always check the output. At this time,\nI think the best use for this method, is for a quick analysis.\n\n## Vector functions\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n`mall` includes functions that expect a vector, instead of a table, to run the\npredictions. This should make it easier to test things, such as custom prompts\nor results of specific text. Each `llm_` function has a corresponding `llm_vec_`\nfunction:\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_vec_sentiment(\"I am happy\")\n#> [1] \"positive\"\n```\n:::\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_vec_translate(\"Este es el mejor dia!\", \"english\")\n#> [1] \"It's the best day!\"\n```\n:::\n\n\n## Python \n\n`mall` is also able to process vectors contained in a `list` object. This allows\nus to avoid having to convert a list of texts without having to first convert\nthem into a single column data frame. To use, initialize a new `LlmVec` class\nobject with either an Ollama model, or a `chatlas` `Chat` object, and then\naccess the same NLP functions as the Polars extension.\n\n\n::: {.cell}\n\n```{.python .cell-code}\n# Initialize a Chat object\nfrom chatlas import ChatOllama\nchat = ChatOllama(model = \"llama3.2\")\n\n# Pass it to a new LlmVec\nfrom mall import LlmVec\nllm = LlmVec(chat) \n```\n:::\n\n\nAccess the functions via the new LlmVec object, and pass the text to be processed.\n\n\n::: {.cell}\n\n```{.python .cell-code}\nllm.sentiment([\"I am happy\", \"I am sad\"])\n#> ['positive', 'negative']\n```\n:::\n\n\n\n::: {.cell}\n\n```{.python .cell-code}\nllm.translate([\"Este es el mejor dia!\"], \"english\")\n#> ['This is the best day!']\n```\n:::\n\n\n\n:::\n\n", + "markdown": "---\nformat:\n html:\n toc: true\nexecute:\n eval: true\n freeze: true\n---\n\n\n\n\n\n\n\n\n\n[![PyPi](https://img.shields.io/pypi/v/mlverse-mall)](https://pypi.org/project/mlverse-mall/) [![Python tests](https://github.com/mlverse/mall/actions/workflows/python-tests.yaml/badge.svg)](https://github.com/mlverse/mall/actions/workflows/python-tests.yaml) \\| \"CRAN [![R check](https://github.com/mlverse/mall/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/mlverse/mall/actions/workflows/R-CMD-check.yaml) \\| [![Package coverage](https://codecov.io/gh/mlverse/mall/branch/main/graph/badge.svg)](https://app.codecov.io/gh/mlverse/mall?branch=main)\n\n\n\nUse Large Language Models (LLM) to run Natural Language Processing (NLP) \noperations against your data. It takes advantage of the LLMs general language\ntraining in order to get the predictions, thus removing the need to train a new\nNLP model. `mall` is available for R and Python.\n\nIt works by running multiple LLM predictions against your data. The predictions\nare processed row-wise over a specified column. It relies on the \"one-shot\" \nprompt technique to instruct the LLM on a particular NLP operation to perform. 
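To make that concrete, here is a minimal sketch of the row-wise, one-shot prompting loop, written against the `ollama` Python library. The function name and prompt wording are illustrative assumptions, not `mall`'s actual implementation:

```python
# Minimal sketch (not mall's internal code) of row-wise, one-shot prompting.
# Assumes the ollama Python library and a locally pulled model (llama3.2).
import ollama

def sentiment_rowwise(texts, model="llama3.2"):
    instruction = (
        "You are a sentiment classifier. "
        "Return only one word: positive, negative, or neutral. "
        "Classify the following text: "
    )
    results = []
    for text in texts:  # one LLM call per row
        resp = ollama.chat(
            model=model,
            messages=[{"role": "user", "content": instruction + text}],
        )
        results.append(resp["message"]["content"].strip().lower())
    return results
```

Each prediction is independent of the others, which is also what makes the per-response caching described below effective.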
\nThe package includes prompts to perform the following specific NLP operations:\n\n- [Sentiment analysis](#sentiment)\n- [Text summarizing](#summarize)\n- [Classify text](#classify)\n- [Extract one, or several](#extract), specific pieces information from the text\n- [Translate text](#translate)\n- [Verify that something is true](#verify) about the text (binary)\n\nFor other NLP operations, `mall` offers the ability for you to [write your own prompt](#custom-prompt).\n\n\n\nIn **R** The functions inside `mall` are designed to easily work with piped \ncommands, such as `dplyr`.\n\n``` r\nreviews |>\n llm_sentiment(review)\n```\n\n\n\nIn **Python**, `mall` is a library extension to [Polars](https://pola.rs/).\n\n``` python\nreviews.llm.sentiment(\"review\")\n```\n\n## Motivation\n\nWe want to new find new ways to help data scientists use LLMs in their daily work.\nUnlike the familiar interfaces, such as chatting and code completion, this \ninterface runs your text data directly against the LLM. This package is inspired\nby the SQL AI functions now offered by vendors such as [Databricks](https://docs.databricks.com/en/large-language-models/ai-functions.html) \nand Snowflake. \n\nThe LLM's flexibility, allows for it to adapt to the subject of your data, and\nprovide surprisingly accurate predictions. This saves the data scientist the \nneed to write and tune an NLP model.\n\nIn recent times, the capabilities of LLMs that can run locally in your computer \nhave increased dramatically. This means that these sort of analysis can run in \nyour machine with good accuracy. It also makes it possible to take \nadvantage of LLMs at your institution, since the data will not leave the \ncorporate network. Additionally, LLM management and integration platforms, such\nas [Ollama](https://ollama.com/), are now very easy to setup and use. `mall`\nuses Ollama as to interact with local LLMs.\n\nThe development version of `mall` lets you **use external LLMs such as\n[OpenAI](https://openai.com/), [Gemini](https://gemini.google.com/) and\n[Anthropic](https://www.anthropic.com/)**. In R, `mall` uses the\n[`ellmer`](https://ellmer.tidyverse.org/index.html)\npackage to integrate with the external LLM, and the \n[`chatlas`](https://posit-dev.github.io/chatlas/) package to integrate in Python.\n\n## Install `mall` {#get-started}\n\nInstall the package to get started:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\nOfficial version from CRAN:\n\n``` r\ninstall.packages(\"mall\")\n```\n\nDevelopment version from GitHub *(required for remote LLM integration)*:\n\n``` r\npak::pak(\"mlverse/mall/r\")\n```\n\n## Python\n\nOfficial version from PyPi:\n\n``` python\npip install mlverse-mall\n```\n\nDevelopment version from GitHub:\n\n``` python\npip install \"mlverse-mall @ git+https://git@github.com/mlverse/mall.git#subdirectory=python\"\n```\n:::\n\n## Setup the LLM\n\nChoose one of the two following options to setup LLM connectivity:\n\n### Local LLMs, via Ollama {#local-llms}\n\n- [Download Ollama from the official website](https://ollama.com/download)\n\n- Install and start Ollama in your computer\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n- Install Ollama in your machine. The `ollamar` package's website provides this \n[Installation guide](https://hauselin.github.io/ollama-r/#installation)\n\n- Download an LLM model. For example, I have been developing this package using \nLlama 3.2 to test. 
To get that model you can run:\n\n ``` r\n ollamar::pull(\"llama3.2\")\n ```\n\n## Python\n\n- Install the official Ollama library\n\n ``` python\n pip install ollama\n ```\n\n- Download an LLM model. For example, I have been developing this package\nusing Llama 3.2 to test. To get that model you can run:\n\n ``` python\n import ollama\n ollama.pull('llama3.2')\n ```\n:::\n\n### Remote LLMs {#remote-llms}\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n`mall` uses the `ellmer` package as the integration point to the LLM. This package supports multiple providers such as OpenAI, Anthropic, Google Gemini, etc.\n\n- Install `ellmer`\n\n ``` r\n install.packages(\"ellmer\")\n ```\n\n- Refer to `ellmer`'s documentation to find out how to setup the connections with your selected provider: \n\n- Let `mall` know which `ellmer` object to use during the R session. To do this, call `llm_use()`. Here is an example of using OpenAI:\n\n\n ::: {.cell}\n \n ```{.r .cell-code}\n library(mall)\n library(ellmer)\n chat <- chat_openai()\n #> Using model = \"gpt-4.1\".\n llm_use(chat)\n #> \n #> ── mall session object \n #> Backend: ellmerLLM session: model:gpt-4.1R session:\n #> cache_folder:/var/folders/y_/f_0cx_291nl0s8h26t4jg6ch0000gp/T//RtmpHpDogo/_mall_cacheb8cf19ff4806\n ```\n :::\n\n\n**Set a default LLM for your R session**\n\nAs a convenience, `mall` is able to automatically establish a connection with the\nLLM at the beginning o R session. To do this you can use the `.mall_chat` option:\n\n```r\noptions(.mall_chat = ellmer::chat_openai(model = \"gpt-4o\"))\n```\n\nAdd this line to your *.Rprofile* file in order for that code to run every time\nyou start R. You can call `usethis::edit_r_profile()` to open your .Rprofile\nfile so you can add the option. \n\n## Python\n\n`mall` uses the `chatlas` package as the integration point to the LLM. This \npackage supports multiple providers such as OpenAI, Anthropic, Google Gemini, etc.\n\n- Install the `chatlas` library\n\n ``` python\n pip install chatlas\n ```\n\n- Refer to `chatlas`'s documentation to find out how to setup the connections\nwith your selected provider: \n\n- Let `mall` know which `chatlas` object to use during the Python session. \nTo do this, call `llm_use()`. Here is an example of using OpenAI:\n\n ``` python\n import mall\n from chatlas import ChatOpenAI\n\n chat = ChatOpenAI()\n\n data = mall.MallData\n reviews = data.reviews\n\n reviews.llm.use(chat)\n ```\n:::\n\n## LLM functions\n\nWe will start with loading a very small data set contained in `mall`. It has \n3 product reviews that we will use as the source of our examples.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(mall)\ndata(\"reviews\")\n\nreviews\n#> # A tibble: 3 Ɨ 1\n#> review \n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. \n#> 2 I regret buying this laptop. It is too slow and the keyboard is too noisy \n#> 3 Not sure how to feel about my new washing machine. Great color, but hard to f…\n```\n:::\n\n\n## Python\n\n\n\n\n::: {.cell}\n\n```{.python .cell-code}\nimport mall \ndata = mall.MallData\nreviews = data.reviews\n\nreviews \n```\n\n::: {.cell-output-display}\n\n```{=html}\n
| review |
| --- |
| "This has been the best TV I've ever used. Great screen, and sound." |
| "I regret buying this laptop. It is too slow and the keyboard is too noisy" |
| "Not sure how to feel about my new washing machine. Great color, but hard to figure" |
\n```\n\n:::\n:::\n\n:::\n\n### Sentiment {#sentiment}\n\nAutomatically returns \"positive\", \"negative\", or \"neutral\" based on the text.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_sentiment(review)\n#> # A tibble: 3 Ɨ 2\n#> review .sentiment\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. positive \n#> 2 I regret buying this laptop. It is too slow and the keyboard is to… negative \n#> 3 Not sure how to feel about my new washing machine. Great color, bu… neutral\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_sentiment.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.sentiment(\"review\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
| review | sentiment |
| --- | --- |
| "This has been the best TV I've ever used. Great screen, and sound." | "positive" |
| "I regret buying this laptop. It is too slow and the keyboard is too noisy" | "negative" |
| "Not sure how to feel about my new washing machine. Great color, but hard to figure" | "negative" |
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.sentiment)\n:::\n\n### Summarize {#summarize}\n\nThere may be a need to reduce the number of words in a given text. Typically to \nmake it easier to understand its intent. The function has an argument to control \nthe maximum number of words to output (`max_words`):\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_summarize(review, max_words = 5)\n#> # A tibble: 3 Ɨ 2\n#> review .summary \n#> \n#> 1 This has been the best TV I've ever used. Gr… great tv with good features \n#> 2 I regret buying this laptop. It is too slow … laptop purchase was a mistake \n#> 3 Not sure how to feel about my new washing ma… having mixed feelings about it\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_summarize.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.summarize(\"review\", 5)\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
| review | summary |
| --- | --- |
| "This has been the best TV I've ever used. Great screen, and sound." | "best tv ever purchased" |
| "I regret buying this laptop. It is too slow and the keyboard is too noisy" | "laptop not up to expectations" |
| "Not sure how to feel about my new washing machine. Great color, but hard to figure" | "uncertain about new washer" |
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.summarize)\n:::\n\n### Classify {#classify}\n\nUse the LLM to categorize the text into one of the options you provide:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_classify(review, c(\"appliance\", \"computer\"))\n#> # A tibble: 3 Ɨ 2\n#> review .classify\n#> \n#> 1 This has been the best TV I've ever used. Gr… computer \n#> 2 I regret buying this laptop. It is too slow … computer \n#> 3 Not sure how to feel about my new washing ma… appliance\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_classify.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.classify(\"review\", [\"computer\", \"appliance\"])\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
| review | classify |
| --- | --- |
| "This has been the best TV I've ever used. Great screen, and sound." | "appliance" |
| "I regret buying this laptop. It is too slow and the keyboard is too noisy" | "appliance" |
| "Not sure how to feel about my new washing machine. Great color, but hard to figure" | "appliance" |
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.classify)\n:::\n\n### Extract {#extract}\n\nOne of the most interesting use cases Using natural language, we can tell the \nLLM to return a specific part of the text. In the following example, we request \nthat the LLM return the product being referred to. We do this by simply saying \n\"product\". The LLM understands what we *mean* by that word, and looks for that \nin the text.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_extract(review, \"product\")\n#> # A tibble: 3 Ɨ 2\n#> review .extract \n#> \n#> 1 This has been the best TV I've ever used. Gr… tv \n#> 2 I regret buying this laptop. It is too slow … laptop \n#> 3 Not sure how to feel about my new washing ma… washing machine\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_extract.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.extract(\"review\", \"product\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
| review | extract |
| --- | --- |
| "This has been the best TV I've ever used. Great screen, and sound." | "tv" |
| "I regret buying this laptop. It is too slow and the keyboard is too noisy" | "laptop" |
| "Not sure how to feel about my new washing machine. Great color, but hard to figure" | "washing machine" |
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.extract)\n:::\n\n### Verify {#verify}\n\nThis functions allows you to check and see if a statement is true, based on the\nprovided text. By default, it will return a 1 for \"yes\", and 0 for \"no\". This \ncan be customized.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_verify(review, \"is the customer happy with the purchase\")\n#> # A tibble: 3 Ɨ 2\n#> review .verify\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. 1 \n#> 2 I regret buying this laptop. It is too slow and the keyboard is too n… 0 \n#> 3 Not sure how to feel about my new washing machine. Great color, but h… 0\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_verify.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.verify(\"review\", \"is the customer happy with the purchase\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
| review | verify |
| --- | --- |
| "This has been the best TV I've ever used. Great screen, and sound." | 1 |
| "I regret buying this laptop. It is too slow and the keyboard is too noisy" | 0 |
| "Not sure how to feel about my new washing machine. Great color, but hard to figure" | 0 |
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.verify)\n:::\n\n### Translate {#translate}\n\nAs the title implies, this function will translate the text into a specified \nlanguage. What is really nice, it is that you don't need to specify the language\nof the source text. Only the target language needs to be defined. The \ntranslation accuracy will depend on the LLM\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nreviews |>\n llm_translate(review, \"spanish\")\n#> # A tibble: 3 Ɨ 2\n#> review .translation \n#> \n#> 1 This has been the best TV I've ever used. Gr… Esta ha sido la mejor televisió…\n#> 2 I regret buying this laptop. It is too slow … Me arrepiento de comprar este p…\n#> 3 Not sure how to feel about my new washing ma… No estoy seguro de cómo me sien…\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_translate.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.translate(\"review\", \"spanish\")\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
| review | translation |
| --- | --- |
| "This has been the best TV I've ever used. Great screen, and sound." | "Esta ha sido la mejor TV que he utilizado. Gran pantalla y sonido." |
| "I regret buying this laptop. It is too slow and the keyboard is too noisy" | "Lamento haber comprado este portátil. Está demasiado lento y la tecla es demasiado ruidosa." |
| "Not sure how to feel about my new washing machine. Great color, but hard to figure" | "No estoy seguro de cómo sentirme con mi nueva lavadora. Bonito color, pero difícil de entender." |
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.translate)\n:::\n\n### Custom prompt {#custom-prompt}\n\nIt is possible to pass your own prompt to the LLM, and have `mall` run it \nagainst each text entry:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nmy_prompt <- paste(\n \"Answer a question.\",\n \"Return only the answer, no explanation\",\n \"Acceptable answers are 'yes', 'no'\",\n \"Answer this about the following text, is this a happy customer?:\"\n)\n\nreviews |>\n llm_custom(review, my_prompt)\n#> # A tibble: 3 Ɨ 2\n#> review .pred\n#> \n#> 1 This has been the best TV I've ever used. Great screen, and sound. Yes \n#> 2 I regret buying this laptop. It is too slow and the keyboard is too noi… No \n#> 3 Not sure how to feel about my new washing machine. Great color, but har… No\n```\n:::\n\n\nFor more information and examples visit this function's [R reference page](reference/llm_custom.qmd)\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nmy_prompt = (\n \"Answer a question.\"\n \"Return only the answer, no explanation\"\n \"Acceptable answers are 'yes', 'no'\"\n \"Answer this about the following text, is this a happy customer?:\"\n)\n\nreviews.llm.custom(\"review\", prompt = my_prompt)\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
| review | custom |
| --- | --- |
| "This has been the best TV I've ever used. Great screen, and sound." | "Yes" |
| "I regret buying this laptop. It is too slow and the keyboard is too noisy" | "No" |
| "Not sure how to feel about my new washing machine. Great color, but hard to figure" | "No" |
\n```\n\n:::\n:::\n\n\nFor more information and examples visit this function's [Python reference page](reference/MallFrame.qmd#mall.MallFrame.custom)\n:::\n\n## Model selection and settings\n\n#### Local LLMs via Ollama {#settings-local}\n\nYou can set the model and its options to use when calling the LLM. In this case,\nwe refer to options as model specific things that can be set, such as seed or \ntemperature.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\nInvoking an `llm` function will automatically initialize a model selection if \nyou don't have one selected yet. If there is only one option, it will pre-select\nit for you. If there are more than one available models, then `mall` will \npresent you as menu selection so you can select which model you wish to use.\n\nCalling `llm_use()` directly will let you specify the model and backend to use.\nYou can also setup additional arguments that will be passed down to the function\nthat actually runs the prediction. In the case of Ollama, that function is [`chat()`](https://hauselin.github.io/ollama-r/reference/chat.html).\n\nThe model to use, and other options can be set for the current R session\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_use(\"ollama\", \"llama3.2\", seed = 100, temperature = 0)\n```\n:::\n\n\n## Python\n\nThe model and options to be used will be defined at the Polars data frame object \nlevel. If not passed, the default model will be **llama3.2**.\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.use(\"ollama\", \"llama3.2\", options = dict(seed = 100))\n```\n:::\n\n:::\n\n#### Remote LLMs\n\nThe provider and model selection will be based on the chat object you create. \nAny model related setting, such as temperature, seed and others, should be\nset at the time of the object creation as well.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(mall)\nlibrary(ellmer)\nchat <- chat_openai(model = \"gpt-4o\", seed = 100)\nllm_use(chat)\n```\n:::\n\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nimport mall\nfrom chatlas import ChatOpenAI\nchat = ChatOpenAI(model = \"gpt-4o\", seed= 100)\ndata = mall.MallData\nreviews = data.reviews\nreviews.llm.use(chat)\n```\n:::\n\n:::\n\n\n## Results caching\n\nBy default `mall` caches the requests and corresponding results from a given \nLLM run. Each response is saved as individual JSON files. By default, the folder\nname is `_mall_cache`. The folder name can be customized, if needed. Also, the\ncaching can be turned off by setting the argument to empty (`\"\"`).\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_use(.cache = \"_my_cache\")\n```\n:::\n\n\nTo turn off:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_use(.cache = \"\")\n```\n:::\n\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.use(_cache = \"my_cache\")\n```\n:::\n\n\nTo turn off:\n\n\n::: {.cell}\n\n```{.python .cell-code}\nreviews.llm.use(_cache = \"\")\n```\n:::\n\n:::\n\nFor more information see the [Caching Results](articles/caching.qmd) article.\n\n## Key considerations\n\nThe main consideration is **cost**. Either, time cost, or money cost.\n\nIf using this method with an LLM locally available, the cost will be a long \nrunning time. Unless using a very specialized LLM, a given LLM is a general \nmodel. It was fitted using a vast amount of data. So determining a response for\neach row, takes longer than if using a manually created NLP model. 
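That per-row cost is also why the request-level cache described above pays off on re-runs: a repeated call reads the saved JSON file instead of hitting the LLM again. Here is a minimal sketch of that idea, with an assumed file layout rather than `mall`'s actual code:

```python
# Illustrative sketch of per-request JSON caching (not mall's actual code).
# The request is hashed and the digest names the cache file.
import hashlib
import json
import os

def cached_call(prompt, text, call_fn, cache="_mall_cache"):
    if cache == "":
        return call_fn(prompt, text)  # caching turned off
    os.makedirs(cache, exist_ok=True)
    key = hashlib.sha1((prompt + text).encode("utf-8")).hexdigest()
    path = os.path.join(cache, key + ".json")
    if os.path.exists(path):  # cache hit: skip the LLM call
        with open(path) as f:
            return json.load(f)["response"]
    response = call_fn(prompt, text)
    with open(path, "w") as f:
        json.dump({"response": response}, f)
    return response
```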
The default\nmodel used in Ollama is [Llama 3.2](https://ollama.com/library/llama3.2), which \nwas fitted using 3B parameters.\n\nIf using an external LLM service, the consideration will need to be for the \nbilling costs of using such service. Keep in mind that you will be sending a \nlot of data to be evaluated.\n\nAnother consideration is the novelty of this approach. Early tests are providing\nencouraging results. But you, as an user, will still need to keep in mind that \nthe predictions will not be infallible, so always check the output. At this time,\nI think the best use for this method, is for a quick analysis.\n\n## Vector functions\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n`mall` includes functions that expect a vector, instead of a table, to run the\npredictions. This should make it easier to test things, such as custom prompts\nor results of specific text. Each `llm_` function has a corresponding `llm_vec_`\nfunction:\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_vec_sentiment(\"I am happy\")\n#> [1] \"positive\"\n```\n:::\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nllm_vec_translate(\"Este es el mejor dia!\", \"english\")\n#> [1] \"It's the best day!\"\n```\n:::\n\n\n## Python \n\n`mall` is also able to process vectors contained in a `list` object. This allows\nus to avoid having to convert a list of texts without having to first convert\nthem into a single column data frame. To use, initialize a new `LLMVec` class\nobject with either an Ollama model, or a `chatlas` `Chat` object, and then\naccess the same NLP functions as the Polars extension.\n\n\n::: {.cell}\n\n```{.python .cell-code}\n# Initialize a Chat object\nfrom chatlas import ChatOllama\nchat = ChatOllama(model = \"llama3.2\")\n\n# Pass it to a new LLMVec\nfrom mall import LLMVec\nllm = LLMVec(chat) \n```\n:::\n\n\nAccess the functions via the new LLMVec object, and pass the text to be processed.\n\n\n::: {.cell}\n\n```{.python .cell-code}\nllm.sentiment([\"I am happy\", \"I am sad\"])\n#> ['positive', 'negative']\n```\n:::\n\n\n\n::: {.cell}\n\n```{.python .cell-code}\nllm.translate([\"Este es el mejor dia!\"], \"english\")\n#> [\"It's the best day!\"]\n```\n:::\n\n\n\n:::\n\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/reference/LlmVec/execute-results/html.json b/_freeze/reference/LlmVec/execute-results/html.json index b66eefc..0b493ac 100644 --- a/_freeze/reference/LlmVec/execute-results/html.json +++ b/_freeze/reference/LlmVec/execute-results/html.json @@ -1,10 +1,10 @@ { - "hash": "d14e6fbb48f3ed7d5b6e00d8eff1914e", + "hash": "db8aa962358a8674ed6a69098f0ff7ea", "result": { "engine": "jupyter", - "markdown": "---\ntitle: LlmVec\n---\n\n\n\n`LlmVec(self, backend='', model='', _cache='_mall_cache', **kwargs)`\n\nClass that adds ability to use an LLM to run batch predictions\n\n\n\n::: {#83f06b2c .cell execution_count=2}\n``` {.python .cell-code}\nfrom chatlas import ChatOllama\nfrom mall import LlmVec\n\nchat = ChatOllama(model = \"llama3.2\")\n\nllm = LlmVec(chat) \n```\n:::\n\n\n## Methods\n\n| Name | Description |\n| --- | --- |\n| [classify](#mall.LlmVec.classify) | Classify text into specific categories. |\n| [custom](#mall.LlmVec.custom) | Provide the full prompt that the LLM will process. |\n| [extract](#mall.LlmVec.extract) | Pull a specific label from the text. |\n| [sentiment](#mall.LlmVec.sentiment) | Use an LLM to run a sentiment analysis |\n| [summarize](#mall.LlmVec.summarize) | Summarize the text down to a specific number of words. 
|\n| [translate](#mall.LlmVec.translate) | Translate text into another language. |\n| [verify](#mall.LlmVec.verify) | Check to see if something is true about the text. |\n\n### classify { #mall.LlmVec.classify }\n\n`LlmVec.classify(x, labels='', additional='')`\n\nClassify text into specific categories.\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|--------------|--------|-------------------------------------------------------------------------------------------------------------------------|------------|\n| `x` | list | A list of texts | _required_ |\n| `labels` | list | A list or a DICT object that defines the categories to classify the text as. It will return one of the provided labels. | `''` |\n| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n#### Examples\n\n::: {#730755cf .cell execution_count=3}\n``` {.python .cell-code}\nllm.classify(['this is important!', 'there is no rush'], ['urgent', 'not urgent'])\n```\n\n::: {.cell-output .cell-output-display execution_count=9}\n```\n['urgent', None]\n```\n:::\n:::\n\n\n### custom { #mall.LlmVec.custom }\n\n`LlmVec.custom(x, prompt='', valid_resps='')`\n\nProvide the full prompt that the LLM will process.\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|----------|--------|----------------------------------------------------|------------|\n| `x` | list | A list of texts | _required_ |\n| `prompt` | str | The prompt to send to the LLM along with the `col` | `''` |\n\n### extract { #mall.LlmVec.extract }\n\n`LlmVec.extract(x, labels='', additional='')`\n\nPull a specific label from the text.\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|--------------|--------|--------------------------------------------------------------------------------|------------|\n| `x` | list | A list of texts | _required_ |\n| `labels` | list | A list or a DICT object that defines tells the LLM what to look for and return | `''` |\n| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n#### Examples\n\n::: {#def9cfbf .cell execution_count=4}\n``` {.python .cell-code}\nllm.extract(['bob smith, 123 3rd street'], labels=['name', 'address'])\n```\n\n::: {.cell-output .cell-output-display execution_count=10}\n```\n['| bob smith | 123 3rd street |']\n```\n:::\n:::\n\n\n### sentiment { #mall.LlmVec.sentiment }\n\n`LlmVec.sentiment(x, options=['positive', 'negative', 'neutral'], additional='')`\n\nUse an LLM to run a sentiment analysis\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|--------------|--------------|----------------------------------------------------------------|---------------------------------------|\n| `x` | list | A list of texts | _required_ |\n| `options` | list or dict | A list of the sentiment options to use, or a named DICT object | `['positive', 'negative', 'neutral']` |\n| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n#### Examples\n\n::: {#567dc847 .cell execution_count=5}\n``` {.python .cell-code}\nllm.sentiment(['I am happy', 'I am sad'])\n```\n\n::: {.cell-output .cell-output-display execution_count=11}\n```\n['positive', 'negative']\n```\n:::\n:::\n\n\n### summarize { #mall.LlmVec.summarize }\n\n`LlmVec.summarize(x, max_words=10, additional='')`\n\nSummarize the text down to a specific number of words.\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|--------------|--------|---------------------------------------------------|------------|\n| `x` | list | A list of 
texts | _required_ |\n| `max_words` | int | Maximum number of words to use for the summary | `10` |\n| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n#### Examples\n\n::: {#9c92779d .cell execution_count=6}\n``` {.python .cell-code}\nllm.summarize(['This has been the best TV Ive ever used. Great screen, and sound.'], max_words = 5)\n```\n\n::: {.cell-output .cell-output-display execution_count=12}\n```\n['this tv has exceeded expectations']\n```\n:::\n:::\n\n\n### translate { #mall.LlmVec.translate }\n\n`LlmVec.translate(x, language='', additional='')`\n\nTranslate text into another language.\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|--------------|--------|------------------------------------------------------------|------------|\n| `x` | list | A list of texts | _required_ |\n| `language` | str | The target language to translate to. For example 'French'. | `''` |\n| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n#### Examples\n\n::: {#d41bc3b8 .cell execution_count=7}\n``` {.python .cell-code}\nllm.translate(['This has been the best TV Ive ever used. Great screen, and sound.'], language = 'spanish')\n```\n\n::: {.cell-output .cell-output-display execution_count=13}\n```\n['Esto ha sido la mejor televisión que he tenido, gran pantalla y sonido.']\n```\n:::\n:::\n\n\n### verify { #mall.LlmVec.verify }\n\n`LlmVec.verify(x, what='', yes_no=[1, 0], additional='')`\n\nCheck to see if something is true about the text.\n\n#### Parameters\n\n| Name | Type | Description | Default |\n|--------------|--------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|\n| `x` | list | A list of texts | _required_ |\n| `what` | str | The statement or question that needs to be verified against the provided text | `''` |\n| `yes_no` | list | A positional list of size 2, which contains the values to return if true and false. The first position will be used as the 'true' value, and the second as the 'false' value | `[1, 0]` |\n| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n", + "markdown": "---\ntitle: LLMVec\n---\n\n\n\n```python\nLLMVec(backend='', model='', _cache='_mall_cache', **kwargs)\n```\n\nClass that adds ability to use an LLM to run batch predictions\n\n\n::: {#caadb44f .cell execution_count=1}\n``` {.python .cell-code}\nfrom chatlas import ChatOllama\nfrom mall import LLMVec\n\nchat = ChatOllama(model = \"llama3.2\")\n\nllm = LLMVec(chat) \n```\n:::\n\n\n## Methods\n\n| Name | Description |\n| --- | --- |\n| [classify](#mall.LLMVec.classify) | Classify text into specific categories. |\n| [custom](#mall.LLMVec.custom) | Provide the full prompt that the LLM will process. |\n| [extract](#mall.LLMVec.extract) | Pull a specific label from the text. |\n| [sentiment](#mall.LLMVec.sentiment) | Use an LLM to run a sentiment analysis |\n| [summarize](#mall.LLMVec.summarize) | Summarize the text down to a specific number of words. |\n| [translate](#mall.LLMVec.translate) | Translate text into another language. |\n| [verify](#mall.LLMVec.verify) | Check to see if something is true about the text. 
|\n\n### classify { #mall.LLMVec.classify }\n\n```python\nLLMVec.classify(x, labels='', additional='')\n```\n\nClassify text into specific categories.\n\n#### Parameters {.doc-section .doc-section-parameters}\n\n| Name | Type | Description | Default |\n|------------|--------|-------------------------------------------------------------------------------------------------------------------------|------------|\n| x | list | A list of texts | _required_ |\n| labels | list | A list or a DICT object that defines the categories to classify the text as. It will return one of the provided labels. | `''` |\n| additional | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n#### Examples {.doc-section .doc-section-examples}\n\n::: {#f0de90cb .cell execution_count=2}\n``` {.python .cell-code}\nllm.classify(['this is important!', 'there is no rush'], ['urgent', 'not urgent'])\n```\n\n::: {.cell-output .cell-output-display execution_count=2}\n```\n['urgent', None]\n```\n:::\n:::\n\n\n### custom { #mall.LLMVec.custom }\n\n```python\nLLMVec.custom(x, prompt='', valid_resps='')\n```\n\nProvide the full prompt that the LLM will process.\n\n#### Parameters {.doc-section .doc-section-parameters}\n\n| Name | Type | Description | Default |\n|--------|--------|----------------------------------------------------|------------|\n| x | list | A list of texts | _required_ |\n| prompt | str | The prompt to send to the LLM along with the `col` | `''` |\n\n### extract { #mall.LLMVec.extract }\n\n```python\nLLMVec.extract(x, labels='', additional='')\n```\n\nPull a specific label from the text.\n\n#### Parameters {.doc-section .doc-section-parameters}\n\n| Name | Type | Description | Default |\n|------------|--------|--------------------------------------------------------------------------------|------------|\n| x | list | A list of texts | _required_ |\n| labels | list | A list or a DICT object that defines tells the LLM what to look for and return | `''` |\n| additional | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n#### Examples {.doc-section .doc-section-examples}\n\n::: {#49687301 .cell execution_count=3}\n``` {.python .cell-code}\nllm.extract(['bob smith, 123 3rd street'], labels=['name', 'address'])\n```\n\n::: {.cell-output .cell-output-display execution_count=3}\n```\n['| bob smith | 123 3rd street |']\n```\n:::\n:::\n\n\n### sentiment { #mall.LLMVec.sentiment }\n\n```python\nLLMVec.sentiment(x, options=['positive', 'negative', 'neutral'], additional='')\n```\n\nUse an LLM to run a sentiment analysis\n\n#### Parameters {.doc-section .doc-section-parameters}\n\n| Name | Type | Description | Default |\n|------------|--------------|----------------------------------------------------------------|---------------------------------------|\n| x | list | A list of texts | _required_ |\n| options | list or dict | A list of the sentiment options to use, or a named DICT object | `['positive', 'negative', 'neutral']` |\n| additional | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n#### Examples {.doc-section .doc-section-examples}\n\n::: {#af1bc6cc .cell execution_count=4}\n``` {.python .cell-code}\nllm.sentiment(['I am happy', 'I am sad'])\n```\n\n::: {.cell-output .cell-output-display execution_count=4}\n```\n['positive', 'negative']\n```\n:::\n:::\n\n\n### summarize { #mall.LLMVec.summarize }\n\n```python\nLLMVec.summarize(x, max_words=10, additional='')\n```\n\nSummarize the text down to a specific number of words.\n\n#### Parameters {.doc-section 
.doc-section-parameters}\n\n| Name | Type | Description | Default |\n|------------|--------|---------------------------------------------------|------------|\n| x | list | A list of texts | _required_ |\n| max_words | int | Maximum number of words to use for the summary | `10` |\n| additional | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n#### Examples {.doc-section .doc-section-examples}\n\n::: {#1960bc54 .cell execution_count=5}\n``` {.python .cell-code}\nllm.summarize(['This has been the best TV Ive ever used. Great screen, and sound.'], max_words = 5)\n```\n\n::: {.cell-output .cell-output-display execution_count=5}\n```\n['this tv has exceeded expectations']\n```\n:::\n:::\n\n\n### translate { #mall.LLMVec.translate }\n\n```python\nLLMVec.translate(x, language='', additional='')\n```\n\nTranslate text into another language.\n\n#### Parameters {.doc-section .doc-section-parameters}\n\n| Name | Type | Description | Default |\n|------------|--------|------------------------------------------------------------|------------|\n| x | list | A list of texts | _required_ |\n| language | str | The target language to translate to. For example 'French'. | `''` |\n| additional | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n#### Examples {.doc-section .doc-section-examples}\n\n::: {#84be8518 .cell execution_count=6}\n``` {.python .cell-code}\nllm.translate(['This has been the best TV Ive ever used. Great screen, and sound.'], language = 'spanish')\n```\n\n::: {.cell-output .cell-output-display execution_count=6}\n```\n['Esto ha sido la mejor televisión que he tenido, gran pantalla y sonido.']\n```\n:::\n:::\n\n\n### verify { #mall.LLMVec.verify }\n\n```python\nLLMVec.verify(x, what='', yes_no=[1, 0], additional='')\n```\n\nCheck to see if something is true about the text.\n\n#### Parameters {.doc-section .doc-section-parameters}\n\n| Name | Type | Description | Default |\n|------------|--------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|\n| x | list | A list of texts | _required_ |\n| what | str | The statement or question that needs to be verified against the provided text | `''` |\n| yes_no | list | A positional list of size 2, which contains the values to return if true and false. The first position will be used as the 'true' value, and the second as the 'false' value | `[1, 0]` |\n| additional | str | Inserts this text into the prompt sent to the LLM | `''` |\n\n", "supporting": [ - "LlmVec_files" + "LLMVec_files" ], "filters": [], "includes": {} diff --git a/_quarto.yml b/_quarto.yml index c5d7e7a..86311e8 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -66,7 +66,7 @@ quartodoc: - title: Vectors desc: '' contents: - - name: LlmVec + - name: LLMVec pkgsite: diff --git a/index.qmd b/index.qmd index 07286ee..b7c35b4 100644 --- a/index.qmd +++ b/index.qmd @@ -16,6 +16,8 @@ library(dbplyr) library(tictoc) library(DBI) source("site/knitr-print.R") + +reticulate::use_virtualenv("python/.venv") ``` ```{python} @@ -277,11 +279,6 @@ reviews ``` ::: -```{python} -#| include: false -reviews.llm.use(options = dict(seed = 100), _cache = "_readme_cache") -``` - ### Sentiment {#sentiment} Automatically returns "positive", "negative", or "neutral" based on the text. @@ -620,7 +617,7 @@ llm_vec_translate("Este es el mejor dia!", "english") `mall` is also able to process vectors contained in a `list` object. 
This allows us to process a list of texts without having to first convert
-them into a single column data frame. To use, initialize a new `LlmVec` class
+them into a single column data frame. To use, initialize a new `LLMVec` class
 object with either an Ollama model, or a `chatlas` `Chat` object, and then
access the same NLP functions as the Polars extension.

```{python}
# Create a chatlas Chat object
from chatlas import ChatOllama
chat = ChatOllama(model = "llama3.2")

-# Pass it to a new LlmVec
-from mall import LlmVec
-llm = LlmVec(chat)
+# Pass it to a new LLMVec
+from mall import LLMVec
+llm = LLMVec(chat)
```

-Access the functions via the new LlmVec object, and pass the text to be processed.
+Access the functions via the new LLMVec object, and pass the text to be processed.

```{python}
llm.sentiment(["I am happy", "I am sad"])
diff --git a/objects.json b/objects.json
index f381cae..67a8a41 100644
--- a/objects.json
+++ b/objects.json
@@ -1 +1 @@
-{"project": "mall", "version": "0.0.9999", "count": 34, "items": [{"name": "mall.MallFrame.classify", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.classify", "dispname": "-"}, {"name": "mall.polars.MallFrame.classify", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.classify", "dispname": "mall.MallFrame.classify"}, {"name": "mall.MallFrame.custom", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.custom", "dispname": "-"}, {"name": "mall.polars.MallFrame.custom", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.custom", "dispname": "mall.MallFrame.custom"}, {"name": "mall.MallFrame.extract", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.extract", "dispname": "-"}, {"name": "mall.polars.MallFrame.extract", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.extract", "dispname": "mall.MallFrame.extract"}, {"name": "mall.MallFrame.sentiment", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.sentiment", "dispname": "-"}, {"name": "mall.polars.MallFrame.sentiment", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.sentiment", "dispname": "mall.MallFrame.sentiment"}, {"name": "mall.MallFrame.summarize", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.summarize", "dispname": "-"}, {"name": "mall.polars.MallFrame.summarize", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.summarize", "dispname": "mall.MallFrame.summarize"}, {"name": "mall.MallFrame.translate", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.translate", "dispname": "-"}, {"name": "mall.polars.MallFrame.translate", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.translate", "dispname": "mall.MallFrame.translate"}, {"name": "mall.MallFrame.use", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.use", "dispname": "-"}, {"name": "mall.polars.MallFrame.use", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.use", 
"dispname": "mall.MallFrame.use"}, {"name": "mall.MallFrame.verify", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.verify", "dispname": "-"}, {"name": "mall.polars.MallFrame.verify", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.verify", "dispname": "mall.MallFrame.verify"}, {"name": "mall.MallFrame", "domain": "py", "role": "class", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame", "dispname": "-"}, {"name": "mall.polars.MallFrame", "domain": "py", "role": "class", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame", "dispname": "mall.MallFrame"}, {"name": "mall.LlmVec.classify", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.classify", "dispname": "-"}, {"name": "mall.llmvec.LlmVec.classify", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.classify", "dispname": "mall.LlmVec.classify"}, {"name": "mall.LlmVec.custom", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.custom", "dispname": "-"}, {"name": "mall.llmvec.LlmVec.custom", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.custom", "dispname": "mall.LlmVec.custom"}, {"name": "mall.LlmVec.extract", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.extract", "dispname": "-"}, {"name": "mall.llmvec.LlmVec.extract", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.extract", "dispname": "mall.LlmVec.extract"}, {"name": "mall.LlmVec.sentiment", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.sentiment", "dispname": "-"}, {"name": "mall.llmvec.LlmVec.sentiment", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.sentiment", "dispname": "mall.LlmVec.sentiment"}, {"name": "mall.LlmVec.summarize", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.summarize", "dispname": "-"}, {"name": "mall.llmvec.LlmVec.summarize", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.summarize", "dispname": "mall.LlmVec.summarize"}, {"name": "mall.LlmVec.translate", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.translate", "dispname": "-"}, {"name": "mall.llmvec.LlmVec.translate", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.translate", "dispname": "mall.LlmVec.translate"}, {"name": "mall.LlmVec.verify", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.verify", "dispname": "-"}, {"name": "mall.llmvec.LlmVec.verify", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec.verify", "dispname": "mall.LlmVec.verify"}, {"name": "mall.LlmVec", "domain": "py", "role": "class", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec", "dispname": "-"}, {"name": "mall.llmvec.LlmVec", "domain": "py", "role": "class", "priority": "1", "uri": "reference/LlmVec.html#mall.LlmVec", "dispname": "mall.LlmVec"}]} \ No newline at end of file +{"project": "mall", "version": "0.0.9999", "count": 34, "items": [{"name": "mall.MallFrame.classify", "domain": "py", "role": "function", "priority": "1", "uri": 
"reference/MallFrame.html#mall.MallFrame.classify", "dispname": "-"}, {"name": "mall.polars.MallFrame.classify", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.classify", "dispname": "mall.MallFrame.classify"}, {"name": "mall.MallFrame.custom", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.custom", "dispname": "-"}, {"name": "mall.polars.MallFrame.custom", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.custom", "dispname": "mall.MallFrame.custom"}, {"name": "mall.MallFrame.extract", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.extract", "dispname": "-"}, {"name": "mall.polars.MallFrame.extract", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.extract", "dispname": "mall.MallFrame.extract"}, {"name": "mall.MallFrame.sentiment", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.sentiment", "dispname": "-"}, {"name": "mall.polars.MallFrame.sentiment", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.sentiment", "dispname": "mall.MallFrame.sentiment"}, {"name": "mall.MallFrame.summarize", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.summarize", "dispname": "-"}, {"name": "mall.polars.MallFrame.summarize", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.summarize", "dispname": "mall.MallFrame.summarize"}, {"name": "mall.MallFrame.translate", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.translate", "dispname": "-"}, {"name": "mall.polars.MallFrame.translate", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.translate", "dispname": "mall.MallFrame.translate"}, {"name": "mall.MallFrame.use", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.use", "dispname": "-"}, {"name": "mall.polars.MallFrame.use", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.use", "dispname": "mall.MallFrame.use"}, {"name": "mall.MallFrame.verify", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.verify", "dispname": "-"}, {"name": "mall.polars.MallFrame.verify", "domain": "py", "role": "function", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame.verify", "dispname": "mall.MallFrame.verify"}, {"name": "mall.MallFrame", "domain": "py", "role": "class", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame", "dispname": "-"}, {"name": "mall.polars.MallFrame", "domain": "py", "role": "class", "priority": "1", "uri": "reference/MallFrame.html#mall.MallFrame", "dispname": "mall.MallFrame"}, {"name": "mall.LLMVec.classify", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LLMVec.html#mall.LLMVec.classify", "dispname": "-"}, {"name": "mall.llmvec.LLMVec.classify", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LLMVec.html#mall.LLMVec.classify", "dispname": "mall.LLMVec.classify"}, {"name": "mall.LLMVec.custom", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LLMVec.html#mall.LLMVec.custom", "dispname": 
"-"}, {"name": "mall.llmvec.LLMVec.custom", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LLMVec.html#mall.LLMVec.custom", "dispname": "mall.LLMVec.custom"}, {"name": "mall.LLMVec.extract", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LLMVec.html#mall.LLMVec.extract", "dispname": "-"}, {"name": "mall.llmvec.LLMVec.extract", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LLMVec.html#mall.LLMVec.extract", "dispname": "mall.LLMVec.extract"}, {"name": "mall.LLMVec.sentiment", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LLMVec.html#mall.LLMVec.sentiment", "dispname": "-"}, {"name": "mall.llmvec.LLMVec.sentiment", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LLMVec.html#mall.LLMVec.sentiment", "dispname": "mall.LLMVec.sentiment"}, {"name": "mall.LLMVec.summarize", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LLMVec.html#mall.LLMVec.summarize", "dispname": "-"}, {"name": "mall.llmvec.LLMVec.summarize", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LLMVec.html#mall.LLMVec.summarize", "dispname": "mall.LLMVec.summarize"}, {"name": "mall.LLMVec.translate", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LLMVec.html#mall.LLMVec.translate", "dispname": "-"}, {"name": "mall.llmvec.LLMVec.translate", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LLMVec.html#mall.LLMVec.translate", "dispname": "mall.LLMVec.translate"}, {"name": "mall.LLMVec.verify", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LLMVec.html#mall.LLMVec.verify", "dispname": "-"}, {"name": "mall.llmvec.LLMVec.verify", "domain": "py", "role": "function", "priority": "1", "uri": "reference/LLMVec.html#mall.LLMVec.verify", "dispname": "mall.LLMVec.verify"}, {"name": "mall.LLMVec", "domain": "py", "role": "class", "priority": "1", "uri": "reference/LLMVec.html#mall.LLMVec", "dispname": "-"}, {"name": "mall.llmvec.LLMVec", "domain": "py", "role": "class", "priority": "1", "uri": "reference/LLMVec.html#mall.LLMVec", "dispname": "mall.LLMVec"}]} \ No newline at end of file diff --git a/python/mall/__init__.py b/python/mall/__init__.py index 84bc232..de3669a 100644 --- a/python/mall/__init__.py +++ b/python/mall/__init__.py @@ -1,5 +1,5 @@ -__all__ = ["MallFrame", "MallData"] +__all__ = ["MallFrame", "MallData", "LLMVec"] from mall.polars import MallFrame from mall.data import MallData -from mall.llmvec import LlmVec +from mall.llmvec import LLMVec diff --git a/python/mall/llm.py b/python/mall/llm.py index b327f1a..efed256 100644 --- a/python/mall/llm.py +++ b/python/mall/llm.py @@ -81,7 +81,7 @@ def llm_call(x, msg, use, valid_resps="", convert=None, data_type=None): messages=build_msg(x, msg), options=use.get("options"), ) - + out = "" cache = "" if use.get("_cache") != "": diff --git a/python/mall/llmvec.py b/python/mall/llmvec.py index 8ace983..2b25b9b 100644 --- a/python/mall/llmvec.py +++ b/python/mall/llmvec.py @@ -11,16 +11,16 @@ from mall.llm import llm_use, llm_loop -class LlmVec: +class LLMVec: """Class that adds ability to use an LLM to run batch predictions ```{python} from chatlas import ChatOllama - from mall import LlmVec + from mall import LLMVec chat = ChatOllama(model = "llama3.2") - llm = LlmVec(chat) + llm = LLMVec(chat) ``` """ def __init__(self, backend="", model="", _cache="_mall_cache", **kwargs): diff --git a/python/pyproject.toml b/python/pyproject.toml index 
9a69fb8..f4b4d2d 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -3,7 +3,7 @@ packages = ["mall"] [project] name = "mlverse-mall" -version = "0.1.0.9002" +version = "0.1.0.9003" description = "Run multiple 'Large Language Model' predictions against a table. The predictions run row-wise over a specified column." readme = "README.md" authors = [ diff --git a/reference/LlmVec.qmd b/reference/LlmVec.qmd index 7f09fe6..7318184 100644 --- a/reference/LlmVec.qmd +++ b/reference/LlmVec.qmd @@ -1,159 +1,170 @@ -# LlmVec { #mall.LlmVec } +# LLMVec { #mall.LLMVec } -`LlmVec(self, backend='', model='', _cache='_mall_cache', **kwargs)` +```python +LLMVec(backend='', model='', _cache='_mall_cache', **kwargs) +``` Class that adds ability to use an LLM to run batch predictions -```{python} -#| include: false -import mall -``` - ```{python} from chatlas import ChatOllama -from mall import LlmVec +from mall import LLMVec chat = ChatOllama(model = "llama3.2") -llm = LlmVec(chat) +llm = LLMVec(chat) ``` ## Methods | Name | Description | | --- | --- | -| [classify](#mall.LlmVec.classify) | Classify text into specific categories. | -| [custom](#mall.LlmVec.custom) | Provide the full prompt that the LLM will process. | -| [extract](#mall.LlmVec.extract) | Pull a specific label from the text. | -| [sentiment](#mall.LlmVec.sentiment) | Use an LLM to run a sentiment analysis | -| [summarize](#mall.LlmVec.summarize) | Summarize the text down to a specific number of words. | -| [translate](#mall.LlmVec.translate) | Translate text into another language. | -| [verify](#mall.LlmVec.verify) | Check to see if something is true about the text. | - -### classify { #mall.LlmVec.classify } - -`LlmVec.classify(x, labels='', additional='')` +| [classify](#mall.LLMVec.classify) | Classify text into specific categories. | +| [custom](#mall.LLMVec.custom) | Provide the full prompt that the LLM will process. | +| [extract](#mall.LLMVec.extract) | Pull a specific label from the text. | +| [sentiment](#mall.LLMVec.sentiment) | Use an LLM to run a sentiment analysis | +| [summarize](#mall.LLMVec.summarize) | Summarize the text down to a specific number of words. | +| [translate](#mall.LLMVec.translate) | Translate text into another language. | +| [verify](#mall.LLMVec.verify) | Check to see if something is true about the text. | + +### classify { #mall.LLMVec.classify } + +```python +LLMVec.classify(x, labels='', additional='') +``` Classify text into specific categories. -#### Parameters +#### Parameters {.doc-section .doc-section-parameters} -| Name | Type | Description | Default | -|--------------|--------|-------------------------------------------------------------------------------------------------------------------------|------------| -| `x` | list | A list of texts | _required_ | -| `labels` | list | A list or a DICT object that defines the categories to classify the text as. It will return one of the provided labels. | `''` | -| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` | +| Name | Type | Description | Default | +|------------|--------|-------------------------------------------------------------------------------------------------------------------------|------------| +| x | list | A list of texts | _required_ | +| labels | list | A list or a DICT object that defines the categories to classify the text as. It will return one of the provided labels. 
| `''` |
+| additional | str | Inserts this text into the prompt sent to the LLM | `''` |

-#### Examples
+#### Examples {.doc-section .doc-section-examples}

```{python}
llm.classify(['this is important!', 'there is no rush'], ['urgent', 'not urgent'])
```

-### custom { #mall.LlmVec.custom }
+### custom { #mall.LLMVec.custom }

-`LlmVec.custom(x, prompt='', valid_resps='')`
+```python
+LLMVec.custom(x, prompt='', valid_resps='')
+```

Provide the full prompt that the LLM will process.

-#### Parameters
+#### Parameters {.doc-section .doc-section-parameters}

-| Name | Type | Description | Default |
-|----------|--------|----------------------------------------------------|------------|
-| `x` | list | A list of texts | _required_ |
-| `prompt` | str | The prompt to send to the LLM along with the `col` | `''` |
+| Name | Type | Description | Default |
+|--------|--------|----------------------------------------------------|------------|
+| x | list | A list of texts | _required_ |
+| prompt | str | The prompt to send to the LLM along with the `col` | `''` |

-### extract { #mall.LlmVec.extract }
+### extract { #mall.LLMVec.extract }

-`LlmVec.extract(x, labels='', additional='')`
+```python
+LLMVec.extract(x, labels='', additional='')
+```

Pull a specific label from the text.

-#### Parameters
+#### Parameters {.doc-section .doc-section-parameters}

-| Name | Type | Description | Default |
-|--------------|--------|--------------------------------------------------------------------------------|------------|
-| `x` | list | A list of texts | _required_ |
-| `labels` | list | A list or a DICT object that defines tells the LLM what to look for and return | `''` |
-| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` |
+| Name | Type | Description | Default |
+|------------|--------|--------------------------------------------------------------------------------|------------|
+| x | list | A list of texts | _required_ |
+| labels | list | A list or a DICT object that tells the LLM what to look for and return | `''` |
+| additional | str | Inserts this text into the prompt sent to the LLM | `''` |

-#### Examples
+#### Examples {.doc-section .doc-section-examples}

```{python}
llm.extract(['bob smith, 123 3rd street'], labels=['name', 'address'])
```

-### sentiment { #mall.LlmVec.sentiment }
+### sentiment { #mall.LLMVec.sentiment }

-`LlmVec.sentiment(x, options=['positive', 'negative', 'neutral'], additional='')`
+```python
+LLMVec.sentiment(x, options=['positive', 'negative', 'neutral'], additional='')
+```

Use an LLM to run a sentiment analysis

-#### Parameters
+#### Parameters {.doc-section .doc-section-parameters}

-| Name | Type | Description | Default |
-|--------------|--------------|----------------------------------------------------------------|---------------------------------------|
-| `x` | list | A list of texts | _required_ |
-| `options` | list or dict | A list of the sentiment options to use, or a named DICT object | `['positive', 'negative', 'neutral']` |
-| `additional` | str | Inserts this text into the prompt sent to the 
LLM | `''` | -#### Examples +#### Examples {.doc-section .doc-section-examples} ```{python} llm.sentiment(['I am happy', 'I am sad']) ``` -### summarize { #mall.LlmVec.summarize } +### summarize { #mall.LLMVec.summarize } -`LlmVec.summarize(x, max_words=10, additional='')` +```python +LLMVec.summarize(x, max_words=10, additional='') +``` Summarize the text down to a specific number of words. -#### Parameters +#### Parameters {.doc-section .doc-section-parameters} -| Name | Type | Description | Default | -|--------------|--------|---------------------------------------------------|------------| -| `x` | list | A list of texts | _required_ | -| `max_words` | int | Maximum number of words to use for the summary | `10` | -| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` | +| Name | Type | Description | Default | +|------------|--------|---------------------------------------------------|------------| +| x | list | A list of texts | _required_ | +| max_words | int | Maximum number of words to use for the summary | `10` | +| additional | str | Inserts this text into the prompt sent to the LLM | `''` | -#### Examples +#### Examples {.doc-section .doc-section-examples} ```{python} llm.summarize(['This has been the best TV Ive ever used. Great screen, and sound.'], max_words = 5) ``` -### translate { #mall.LlmVec.translate } +### translate { #mall.LLMVec.translate } -`LlmVec.translate(x, language='', additional='')` +```python +LLMVec.translate(x, language='', additional='') +``` Translate text into another language. -#### Parameters +#### Parameters {.doc-section .doc-section-parameters} -| Name | Type | Description | Default | -|--------------|--------|------------------------------------------------------------|------------| -| `x` | list | A list of texts | _required_ | -| `language` | str | The target language to translate to. For example 'French'. | `''` | -| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` | +| Name | Type | Description | Default | +|------------|--------|------------------------------------------------------------|------------| +| x | list | A list of texts | _required_ | +| language | str | The target language to translate to. For example 'French'. | `''` | +| additional | str | Inserts this text into the prompt sent to the LLM | `''` | -#### Examples +#### Examples {.doc-section .doc-section-examples} ```{python} llm.translate(['This has been the best TV Ive ever used. Great screen, and sound.'], language = 'spanish') ``` -### verify { #mall.LlmVec.verify } +### verify { #mall.LLMVec.verify } -`LlmVec.verify(x, what='', yes_no=[1, 0], additional='')` +```python +LLMVec.verify(x, what='', yes_no=[1, 0], additional='') +``` Check to see if something is true about the text. -#### Parameters +#### Parameters {.doc-section .doc-section-parameters} -| Name | Type | Description | Default | -|--------------|--------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------| -| `x` | list | A list of texts | _required_ | -| `what` | str | The statement or question that needs to be verified against the provided text | `''` | -| `yes_no` | list | A positional list of size 2, which contains the values to return if true and false. 
The first position will be used as the 'true' value, and the second as the 'false' value | `[1, 0]` | -| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` | \ No newline at end of file +| Name | Type | Description | Default | +|------------|--------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------| +| x | list | A list of texts | _required_ | +| what | str | The statement or question that needs to be verified against the provided text | `''` | +| yes_no | list | A positional list of size 2, which contains the values to return if true and false. The first position will be used as the 'true' value, and the second as the 'false' value | `[1, 0]` | +| additional | str | Inserts this text into the prompt sent to the LLM | `''` | \ No newline at end of file diff --git a/reference/MallFrame.qmd b/reference/MallFrame.qmd index 4496d98..d01ae23 100644 --- a/reference/MallFrame.qmd +++ b/reference/MallFrame.qmd @@ -1,6 +1,8 @@ # MallFrame { #mall.MallFrame } -`MallFrame(self, df)` +```python +MallFrame(df) +``` Extension to Polars that add ability to use an LLM to run batch predictions over a data frame @@ -36,20 +38,22 @@ reviews.llm.use(options = dict(seed = 100)) ### classify { #mall.MallFrame.classify } -`MallFrame.classify(col, labels='', additional='', pred_name='classify')` +```python +MallFrame.classify(col, labels='', additional='', pred_name='classify') +``` Classify text into specific categories. -#### Parameters +#### Parameters {.doc-section .doc-section-parameters} -| Name | Type | Description | Default | -|--------------|--------|-------------------------------------------------------------------------------------------------------------------------|--------------| -| `col` | str | The name of the text field to process | _required_ | -| `labels` | list | A list or a DICT object that defines the categories to classify the text as. It will return one of the provided labels. | `''` | -| `pred_name` | str | A character vector with the name of the new column where the prediction will be placed | `'classify'` | -| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` | +| Name | Type | Description | Default | +|------------|--------|-------------------------------------------------------------------------------------------------------------------------|--------------| +| col | str | The name of the text field to process | _required_ | +| labels | list | A list or a DICT object that defines the categories to classify the text as. It will return one of the provided labels. | `''` | +| pred_name | str | A character vector with the name of the new column where the prediction will be placed | `'classify'` | +| additional | str | Inserts this text into the prompt sent to the LLM | `''` | -#### Examples +#### Examples {.doc-section .doc-section-examples} ```{python} reviews.llm.classify("review", ["appliance", "computer"]) @@ -67,19 +71,21 @@ reviews.llm.classify("review", {"appliance" : "1", "computer" : "2"}) ### custom { #mall.MallFrame.custom } -`MallFrame.custom(col, prompt='', valid_resps='', pred_name='custom')` +```python +MallFrame.custom(col, prompt='', valid_resps='', pred_name='custom') +``` Provide the full prompt that the LLM will process. 
-#### Parameters
+#### Parameters {.doc-section .doc-section-parameters}

-| Name | Type | Description | Default |
-|-------------|--------|----------------------------------------------------------------------------------------|------------|
-| `col` | str | The name of the text field to process | _required_ |
-| `prompt` | str | The prompt to send to the LLM along with the `col` | `''` |
-| `pred_name` | str | A character vector with the name of the new column where the prediction will be placed | `'custom'` |
+| Name | Type | Description | Default |
+|-----------|--------|----------------------------------------------------------------------------------------|------------|
+| col | str | The name of the text field to process | _required_ |
+| prompt | str | The prompt to send to the LLM along with the `col` | `''` |
+| pred_name | str | A character vector with the name of the new column where the prediction will be placed | `'custom'` |

-#### Examples
+#### Examples {.doc-section .doc-section-examples}

```{python}
my_prompt = (
@@ -94,20 +100,28 @@ reviews.llm.custom("review", prompt = my_prompt)
```

### extract { #mall.MallFrame.extract }

-`MallFrame.extract(col, labels='', expand_cols=False, additional='', pred_name='extract')`
+```python
+MallFrame.extract(
+    col,
+    labels='',
+    expand_cols=False,
+    additional='',
+    pred_name='extract',
+)
+```

Pull a specific label from the text.

-#### Parameters
+#### Parameters {.doc-section .doc-section-parameters}

-| Name | Type | Description | Default |
-|--------------|--------|----------------------------------------------------------------------------------------|-------------|
-| `col` | str | The name of the text field to process | _required_ |
-| `labels` | list | A list or a DICT object that defines tells the LLM what to look for and return | `''` |
-| `pred_name` | str | A character vector with the name of the new column where the prediction will be placed | `'extract'` |
-| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` |
+| Name | Type | Description | Default |
+|------------|--------|----------------------------------------------------------------------------------------|-------------|
+| col | str | The name of the text field to process | _required_ |
+| labels | list | A list or a DICT object that tells the LLM what to look for and return | `''` |
+| pred_name | str | A character vector with the name of the new column where the prediction will be placed | `'extract'` |
+| additional | str | Inserts this text into the prompt sent to the LLM | `''` |

-#### Examples
+#### Examples {.doc-section .doc-section-examples}

```{python}
# Use 'labels' to let the function know what to extract
@@ -146,20 +160,27 @@ reviews.llm.extract(

### sentiment { #mall.MallFrame.sentiment }

-`MallFrame.sentiment(col, options=['positive', 'negative', 'neutral'], additional='', pred_name='sentiment')`
+```python
+MallFrame.sentiment(
+    col,
+    options=['positive', 'negative', 'neutral'],
+    additional='',
+    pred_name='sentiment',
+)
+```

Use an LLM to run a sentiment analysis

-#### Parameters
+#### Parameters {.doc-section .doc-section-parameters}

-| Name | Type | Description | Default |
-|--------------|--------------|----------------------------------------------------------------------------------------|---------------------------------------|
-| `col` | str | The name of the text field to process | _required_ |
-| `options` | list or dict | A list of the sentiment options to use, or a named DICT object | `['positive', 
'negative', 'neutral']` | -| `pred_name` | str | A character vector with the name of the new column where the prediction will be placed | `'sentiment'` | -| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` | +| Name | Type | Description | Default | +|------------|--------------|----------------------------------------------------------------------------------------|---------------------------------------| +| col | str | The name of the text field to process | _required_ | +| options | list or dict | A list of the sentiment options to use, or a named DICT object | `['positive', 'negative', 'neutral']` | +| pred_name | str | A character vector with the name of the new column where the prediction will be placed | `'sentiment'` | +| additional | str | Inserts this text into the prompt sent to the LLM | `''` | -#### Examples +#### Examples {.doc-section .doc-section-examples} ```{python} reviews.llm.sentiment("review") @@ -182,20 +203,22 @@ reviews.llm.sentiment("review", {"positive" : 1, "negative" : 0}) ### summarize { #mall.MallFrame.summarize } -`MallFrame.summarize(col, max_words=10, additional='', pred_name='summary')` +```python +MallFrame.summarize(col, max_words=10, additional='', pred_name='summary') +``` Summarize the text down to a specific number of words. -#### Parameters +#### Parameters {.doc-section .doc-section-parameters} -| Name | Type | Description | Default | -|--------------|--------|----------------------------------------------------------------------------------------|-------------| -| `col` | str | The name of the text field to process | _required_ | -| `max_words` | int | Maximum number of words to use for the summary | `10` | -| `pred_name` | str | A character vector with the name of the new column where the prediction will be placed | `'summary'` | -| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` | +| Name | Type | Description | Default | +|------------|--------|----------------------------------------------------------------------------------------|-------------| +| col | str | The name of the text field to process | _required_ | +| max_words | int | Maximum number of words to use for the summary | `10` | +| pred_name | str | A character vector with the name of the new column where the prediction will be placed | `'summary'` | +| additional | str | Inserts this text into the prompt sent to the LLM | `''` | -#### Examples +#### Examples {.doc-section .doc-section-examples} ```{python} # Use max_words to set the maximum number of words to use for the summary @@ -209,20 +232,22 @@ reviews.llm.summarize("review", 5, pred_name = "review_summary") ### translate { #mall.MallFrame.translate } -`MallFrame.translate(col, language='', additional='', pred_name='translation')` +```python +MallFrame.translate(col, language='', additional='', pred_name='translation') +``` Translate text into another language. -#### Parameters +#### Parameters {.doc-section .doc-section-parameters} -| Name | Type | Description | Default | -|--------------|--------|----------------------------------------------------------------------------------------|-----------------| -| `col` | str | The name of the text field to process | _required_ | -| `language` | str | The target language to translate to. For example 'French'. 
| `''` |
-| `pred_name` | str | A character vector with the name of the new column where the prediction will be placed | `'translation'` |
-| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` |
+| Name | Type | Description | Default |
+|------------|--------|----------------------------------------------------------------------------------------|-----------------|
+| col | str | The name of the text field to process | _required_ |
+| language | str | The target language to translate to. For example 'French'. | `''` |
+| pred_name | str | A character vector with the name of the new column where the prediction will be placed | `'translation'` |
+| additional | str | Inserts this text into the prompt sent to the LLM | `''` |

-#### Examples
+#### Examples {.doc-section .doc-section-examples}

```{python}
reviews.llm.translate("review", "spanish")
@@ -234,21 +259,23 @@
```

```{python}
reviews.llm.translate("review", "french")
```

### use { #mall.MallFrame.use }

-`MallFrame.use(backend='', model='', _cache='_mall_cache', **kwargs)`
+```python
+MallFrame.use(backend='', model='', _cache='_mall_cache', **kwargs)
+```

Define the model, backend, and other options to use to interact with the LLM.

-#### Parameters
+#### Parameters {.doc-section .doc-section-parameters}

-| Name | Type | Description | Default |
-|------------|-----------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|
-| `backend` | str \| Chat \| Client | The name of the backend to use, or an Ollama Client object, or a `chatlas` Chat object. At the beginning of the session it defaults to "ollama". If passing `""`, it will remain unchanged | `''` |
-| `model` | str | The name of the model tha the backend should use. At the beginning of the session it defaults to "llama3.2". If passing `""`, it will remain unchanged | `''` |
-| `_cache` | str | The path of where to save the cached results. Passing `""` disables the cache | `'_mall_cache'` |
-| `**kwargs` | | Arguments to pass to the downstream Python call. In this case, the `chat` function in `ollama` | `{}` |
+| Name | Type | Description | Default |
+|----------|-----------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|
+| backend | str \| Chat \| Client | The name of the backend to use, or an Ollama Client object, or a `chatlas` Chat object. At the beginning of the session it defaults to "ollama". If passing `""`, it will remain unchanged | `''` |
+| model | str | The name of the model that the backend should use. At the beginning of the session it defaults to "llama3.2". If passing `""`, it will remain unchanged | `''` |
+| _cache | str | The path of where to save the cached results. Passing `""` disables the cache | `'_mall_cache'` |
+| **kwargs | | Arguments to pass to the downstream Python call. 
In this case, the `chat` function in `ollama` | `{}` | -#### Examples +#### Examples {.doc-section .doc-section-examples} ```{python} # Additional arguments will be passed 'as-is' to the @@ -282,21 +309,23 @@ reviews.llm.use(chat) ### verify { #mall.MallFrame.verify } -`MallFrame.verify(col, what='', yes_no=[1, 0], additional='', pred_name='verify')` +```python +MallFrame.verify(col, what='', yes_no=[1, 0], additional='', pred_name='verify') +``` Check to see if something is true about the text. -#### Parameters +#### Parameters {.doc-section .doc-section-parameters} -| Name | Type | Description | Default | -|--------------|--------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------| -| `col` | str | The name of the text field to process | _required_ | -| `what` | str | The statement or question that needs to be verified against the provided text | `''` | -| `yes_no` | list | A positional list of size 2, which contains the values to return if true and false. The first position will be used as the 'true' value, and the second as the 'false' value | `[1, 0]` | -| `pred_name` | str | A character vector with the name of the new column where the prediction will be placed | `'verify'` | -| `additional` | str | Inserts this text into the prompt sent to the LLM | `''` | +| Name | Type | Description | Default | +|------------|--------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------| +| col | str | The name of the text field to process | _required_ | +| what | str | The statement or question that needs to be verified against the provided text | `''` | +| yes_no | list | A positional list of size 2, which contains the values to return if true and false. The first position will be used as the 'true' value, and the second as the 'false' value | `[1, 0]` | +| pred_name | str | A character vector with the name of the new column where the prediction will be placed | `'verify'` | +| additional | str | Inserts this text into the prompt sent to the LLM | `''` | -#### Examples +#### Examples {.doc-section .doc-section-examples} ```{python} reviews.llm.verify("review", "is the customer happy") diff --git a/reference/_api_index.qmd b/reference/_api_index.qmd index a543276..b5e7f51 100644 --- a/reference/_api_index.qmd +++ b/reference/_api_index.qmd @@ -14,4 +14,4 @@ | | | | --- | --- | -| [LlmVec](LlmVec.qmd#mall.LlmVec) | Class that adds ability to use an LLM to run batch predictions | \ No newline at end of file +| [LLMVec](LLMVec.qmd#mall.LLMVec) | Class that adds ability to use an LLM to run batch predictions | \ No newline at end of file diff --git a/reference/index.qmd b/reference/index.qmd index f7bc2be..670db0b 100644 --- a/reference/index.qmd +++ b/reference/index.qmd @@ -26,20 +26,20 @@ an LLM to run batch predictions over a data frame

-### LlmVec +### LLMVec        Class that adds ability to use an LLM to run batch predictions | Name | Description | | --- | --- | -| [classify](LlmVec.qmd#mall.LlmVec.classify) | Classify text into specific categories. | -| [custom](LlmVec.qmd#mall.LlmVec.custom) | Provide the full prompt that the LLM will process. | -| [extract](LlmVec.qmd#mall.LlmVec.extract) | Pull a specific label from the text. | -| [sentiment](LlmVec.qmd#mall.LlmVec.sentiment) | Use an LLM to run a sentiment analysis | -| [summarize](LlmVec.qmd#mall.LlmVec.summarize) | Summarize the text down to a specific number of words. | -| [translate](LlmVec.qmd#mall.LlmVec.translate) | Translate text into another language. | -| [verify](LlmVec.qmd#mall.LlmVec.verify) | Check to see if something is true about the text. | +| [classify](LLMVec.qmd#mall.LLMVec.classify) | Classify text into specific categories. | +| [custom](LLMVec.qmd#mall.LLMVec.custom) | Provide the full prompt that the LLM will process. | +| [extract](LLMVec.qmd#mall.LLMVec.extract) | Pull a specific label from the text. | +| [sentiment](LLMVec.qmd#mall.LLMVec.sentiment) | Use an LLM to run a sentiment analysis | +| [summarize](LLMVec.qmd#mall.LLMVec.summarize) | Summarize the text down to a specific number of words. | +| [translate](LLMVec.qmd#mall.LLMVec.translate) | Translate text into another language. | +| [verify](LLMVec.qmd#mall.LLMVec.verify) | Check to see if something is true about the text. | ::: From baafd5b5f9de38d54933647eb828ed4ba416fa0c Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Mon, 9 Jun 2025 08:49:50 -0500 Subject: [PATCH 14/23] Adds vec tests --- python/mall/llmvec.py | 2 +- python/tests/test_classify.py | 6 ++++++ python/tests/test_custom.py | 6 ++++++ python/tests/test_extract.py | 6 ++++++ python/tests/test_sentiment.py | 6 ++++++ python/tests/test_summarize.py | 6 ++++++ python/tests/test_translate.py | 6 ++++++ 7 files changed, 37 insertions(+), 1 deletion(-) diff --git a/python/mall/llmvec.py b/python/mall/llmvec.py index 2b25b9b..04ec047 100644 --- a/python/mall/llmvec.py +++ b/python/mall/llmvec.py @@ -179,7 +179,7 @@ def custom(self, x, prompt="", valid_resps="") -> list: The prompt to send to the LLM along with the `col` """ - return llm_loop(x=x, msg=custom(prompt), use=self._use, valid_resps=labels) + return llm_loop(x=x, msg=custom(prompt), use=self._use, valid_resps=valid_resps) def verify(self, x, what="", yes_no=[1, 0], additional="") -> list: """Check to see if something is true about the text. 
diff --git a/python/tests/test_classify.py b/python/tests/test_classify.py index ca39450..7b0f957 100644 --- a/python/tests/test_classify.py +++ b/python/tests/test_classify.py @@ -26,3 +26,9 @@ def pull(df, col): for i in df.select(col).to_dicts(): out.append(i.get(col)) return out + +def test_classify_vec(): + from mall import LLMVec + llm = LLMVec("test", "echo") + x = llm.classify(["a"], ["a", "b"]) + assert x == ['a'] diff --git a/python/tests/test_custom.py b/python/tests/test_custom.py index 515a744..efca829 100644 --- a/python/tests/test_custom.py +++ b/python/tests/test_custom.py @@ -11,3 +11,9 @@ def test_custom_prompt(): x = df.llm.custom("x", "hello") assert x["custom"][0] == "hello: \n{}" shutil.rmtree("_test_cache", ignore_errors=True) + +def test_custom_vec(): + from mall import LLMVec + llm = LLMVec("test", "echo") + x = llm.custom(["a"], "hello") + assert x == ['a'] diff --git a/python/tests/test_extract.py b/python/tests/test_extract.py index 9d675bf..9db3f86 100644 --- a/python/tests/test_extract.py +++ b/python/tests/test_extract.py @@ -44,3 +44,9 @@ def test_extract_expand(): x = df.llm.extract("x", ["a", "b"], expand_cols=True) assert x["a"][0] == "x " shutil.rmtree("_test_cache", ignore_errors=True) + +def test_extract_vec(): + from mall import LLMVec + llm = LLMVec("test", "echo") + x = llm.extract(["a"], ["a"]) + assert x == ['a'] diff --git a/python/tests/test_sentiment.py b/python/tests/test_sentiment.py index 22911f7..1982acf 100644 --- a/python/tests/test_sentiment.py +++ b/python/tests/test_sentiment.py @@ -48,3 +48,9 @@ def pull(df, col): for i in df.select(col).to_dicts(): out.append(i.get(col)) return out + +def test_sentiment_vec(): + from mall import LLMVec + llm = LLMVec("test", "echo") + x = llm.sentiment(["a"], ["a"]) + assert x == ['a'] diff --git a/python/tests/test_summarize.py b/python/tests/test_summarize.py index 0542283..5efc3c4 100644 --- a/python/tests/test_summarize.py +++ b/python/tests/test_summarize.py @@ -25,3 +25,9 @@ def test_summarize_max(): == "You are a helpful summarization engine. Your answer will contain no no capitalization and no explanations. Return no more than 5 words. The answer is the summary of the following text:\n{}" ) shutil.rmtree("_test_cache", ignore_errors=True) + +def test_summarize_vec(): + from mall import LLMVec + llm = LLMVec("test", "echo") + x = llm.summarize(["a"]) + assert x == ['a'] diff --git a/python/tests/test_translate.py b/python/tests/test_translate.py index e6c85c4..88a251c 100644 --- a/python/tests/test_translate.py +++ b/python/tests/test_translate.py @@ -14,3 +14,9 @@ def test_translate_prompt(): == "You are a helpful translation engine. You will return only the translation text, no explanations. The target language to translate to is: spanish. 
The answer is the translation of the following text:\n{}" ) shutil.rmtree("_test_cache", ignore_errors=True) + +def test_translate_vec(): + from mall import LLMVec + llm = LLMVec("test", "echo") + x = llm.translate(["a"], "spanish") + assert x == ['a'] From 30731258b73a5f4903bdf92a1397dae7b59c9cdb Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Mon, 9 Jun 2025 09:02:27 -0500 Subject: [PATCH 15/23] Adds verify vec test --- python/tests/test_verify.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/tests/test_verify.py b/python/tests/test_verify.py index f6df6fe..9a47cb1 100644 --- a/python/tests/test_verify.py +++ b/python/tests/test_verify.py @@ -26,3 +26,10 @@ def pull(df, col): for i in df.select(col).to_dicts(): out.append(i.get(col)) return out + +def test_verify_vec(): + from mall import LLMVec + llm = LLMVec("test", "echo") + x = llm.verify(["a"], "this is the verify test", ["a", "b"]) + assert x == ['a'] + From 68bc977c592f86af3d56a553bb958e1c067442c8 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Mon, 9 Jun 2025 14:16:05 -0500 Subject: [PATCH 16/23] Adds test for ollama Client object --- python/tests/test-llm.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/tests/test-llm.py b/python/tests/test-llm.py index b5d3fef..4f90832 100644 --- a/python/tests/test-llm.py +++ b/python/tests/test-llm.py @@ -14,3 +14,11 @@ def mock_chat(model, messages, options): df.llm.use("ollama", "llama3.2", _cache="") x = df.llm.summarize("x") assert x["summary"][0] == "test" + +def test_ollama_client(): + from ollama import Client + client = Client() + df = pl.DataFrame(dict(x="x")) + df.llm.use(client, _cache="") + use = df.llm._use + assert use.get("backend") == "ollama-client" From 9af39457fd0caa347728e19395afca044fff6012 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Mon, 14 Jul 2025 11:55:52 -0500 Subject: [PATCH 17/23] Renames test file --- python/tests/{test-llm.py => test_llm.py} | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) rename python/tests/{test-llm.py => test_llm.py} (66%) diff --git a/python/tests/test-llm.py b/python/tests/test_llm.py similarity index 66% rename from python/tests/test-llm.py rename to python/tests/test_llm.py index 4f90832..bb5f4ed 100644 --- a/python/tests/test-llm.py +++ b/python/tests/test_llm.py @@ -4,7 +4,6 @@ import shutil import os - def test_ollama(monkeypatch): def mock_chat(model, messages, options): return dict(message=dict(content="test")) @@ -15,10 +14,14 @@ def mock_chat(model, messages, options): x = df.llm.summarize("x") assert x["summary"][0] == "test" -def test_ollama_client(): +def test_ollama_client(monkeypatch): from ollama import Client client = Client() + def mock_chat(model, messages, options): + return dict(message=dict(content="test")) + monkeypatch.setattr("ollama.chat", mock_chat) df = pl.DataFrame(dict(x="x")) df.llm.use(client, _cache="") - use = df.llm._use - assert use.get("backend") == "ollama-client" + x = df.llm.summarize("x") + assert x["summary"][0] == "test" + \ No newline at end of file From e8787c09b90b96757885ff2687f92bcc8a6e6024 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Fri, 25 Jul 2025 09:55:55 -0500 Subject: [PATCH 18/23] Support for new ellmer --- r/R/m-backend-submit.R | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/r/R/m-backend-submit.R 
b/r/R/m-backend-submit.R index 4d5bd5d..b5e4ad7 100644 --- a/r/R/m-backend-submit.R +++ b/r/R/m-backend-submit.R @@ -91,9 +91,17 @@ m_ollama_tokens <- function() { m_backend_submit.mall_ellmer <- function(backend, x, prompt, preview = FALSE) { if (preview) { x <- head(x, 1) - map_here <- map + return(res) } else { - map_here <- map_chr + #map_here <- map_chr + defaults <- m_defaults_args() + ellmer_obj <- defaults[["ellmer_obj"]] + prompt <- prompt[[1]][["content"]] + prompt <- gsub("\\{", "\\{\\{", prompt) + prompt <- gsub("\\}", "\\}\\}", prompt) + prompts <- ellmer::interpolate(prompt, x = x) + res <- ellmer::parallel_chat_text(ellmer_obj, prompts) + return(res) } map_here( x, From ca616e15ba9c1695652026f062ba2efa11338748 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Fri, 25 Jul 2025 10:22:30 -0500 Subject: [PATCH 19/23] Removes test --- python/tests/test_llm.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/python/tests/test_llm.py b/python/tests/test_llm.py index bb5f4ed..b5d3fef 100644 --- a/python/tests/test_llm.py +++ b/python/tests/test_llm.py @@ -4,6 +4,7 @@ import shutil import os + def test_ollama(monkeypatch): def mock_chat(model, messages, options): return dict(message=dict(content="test")) @@ -13,15 +14,3 @@ def mock_chat(model, messages, options): df.llm.use("ollama", "llama3.2", _cache="") x = df.llm.summarize("x") assert x["summary"][0] == "test" - -def test_ollama_client(monkeypatch): - from ollama import Client - client = Client() - def mock_chat(model, messages, options): - return dict(message=dict(content="test")) - monkeypatch.setattr("ollama.chat", mock_chat) - df = pl.DataFrame(dict(x="x")) - df.llm.use(client, _cache="") - x = df.llm.summarize("x") - assert x["summary"][0] == "test" - \ No newline at end of file From 28c18e1eac6f176e9bba85c326cc2e6b10edeb9e Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Fri, 25 Jul 2025 11:37:34 -0500 Subject: [PATCH 20/23] Backing out support for new ellmer version --- r/R/m-backend-submit.R | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/r/R/m-backend-submit.R b/r/R/m-backend-submit.R index b5e4ad7..27c04c2 100644 --- a/r/R/m-backend-submit.R +++ b/r/R/m-backend-submit.R @@ -32,17 +32,17 @@ m_backend_submit.mall_ollama <- function(backend, x, prompt, preview = FALSE) { .args <- c( messages = list( map(prompt, \(i) - map(i, \(j) { - out <- glue(j, x = x) - ln <- length(unlist(strsplit(out, " "))) - if (ln > m_ollama_tokens()) { - warnings <<- c( - warnings, - list(list(row = substr(x, 1, 20), len = ln)) - ) - } - out - })) + map(i, \(j) { + out <- glue(j, x = x) + ln <- length(unlist(strsplit(out, " "))) + if (ln > m_ollama_tokens()) { + warnings <<- c( + warnings, + list(list(row = substr(x, 1, 20), len = ln)) + ) + } + out + })) ), output = "text", m_defaults_args(backend) @@ -91,17 +91,9 @@ m_ollama_tokens <- function() { m_backend_submit.mall_ellmer <- function(backend, x, prompt, preview = FALSE) { if (preview) { x <- head(x, 1) - return(res) + map_here <- map } else { - #map_here <- map_chr - defaults <- m_defaults_args() - ellmer_obj <- defaults[["ellmer_obj"]] - prompt <- prompt[[1]][["content"]] - prompt <- gsub("\\{", "\\{\\{", prompt) - prompt <- gsub("\\}", "\\}\\}", prompt) - prompts <- ellmer::interpolate(prompt, x = x) - res <- ellmer::parallel_chat_text(ellmer_obj, prompts) - return(res) + map_here <- map_chr } map_here( x, From 
f0931417ca29de90bc529f2dbe3868e5aca6f883 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Fri, 25 Jul 2025 13:11:55 -0500 Subject: [PATCH 21/23] Avoids full import of ellmer --- r/NAMESPACE | 2 +- r/R/m-backend-submit.R | 26 +++++++++++++++----------- r/R/mall.R | 4 ++-- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index 6b0e63f..3903ec9 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -34,7 +34,6 @@ export(llm_verify) export(m_backend_prompt) export(m_backend_submit) import(cli) -import(ellmer) import(fs) import(glue) import(rlang) @@ -43,6 +42,7 @@ importFrom(dplyr,mutate) importFrom(dplyr,pull) importFrom(dplyr,sql) importFrom(dplyr,tibble) +importFrom(ellmer,parallel_chat_text) importFrom(jsonlite,fromJSON) importFrom(jsonlite,read_json) importFrom(jsonlite,write_json) diff --git a/r/R/m-backend-submit.R b/r/R/m-backend-submit.R index 27c04c2..83d67de 100644 --- a/r/R/m-backend-submit.R +++ b/r/R/m-backend-submit.R @@ -32,17 +32,17 @@ m_backend_submit.mall_ollama <- function(backend, x, prompt, preview = FALSE) { .args <- c( messages = list( map(prompt, \(i) - map(i, \(j) { - out <- glue(j, x = x) - ln <- length(unlist(strsplit(out, " "))) - if (ln > m_ollama_tokens()) { - warnings <<- c( - warnings, - list(list(row = substr(x, 1, 20), len = ln)) - ) - } - out - })) + map(i, \(j) { + out <- glue(j, x = x) + ln <- length(unlist(strsplit(out, " "))) + if (ln > m_ollama_tokens()) { + warnings <<- c( + warnings, + list(list(row = substr(x, 1, 20), len = ln)) + ) + } + out + })) ), output = "text", m_defaults_args(backend) @@ -127,6 +127,10 @@ m_ellmer_chat <- function(...) { temp_ellmer$chat(...) } +dummy_func <- function(x, y) { + parallel_chat_text(x, y) +} + # ------------------------------ Simulate -------------------------------------- #' @export diff --git a/r/R/mall.R b/r/R/mall.R index befecfe..6c1168a 100644 --- a/r/R/mall.R +++ b/r/R/mall.R @@ -1,8 +1,8 @@ #' @importFrom ollamar chat test_connection list_models #' @importFrom dplyr mutate tibble bind_cols pull sql -#' @importFrom utils menu head #' @importFrom jsonlite fromJSON read_json write_json -#' @import ellmer +#' @importFrom ellmer parallel_chat_text +#' @importFrom utils menu head #' @import rlang #' @import glue #' @import cli From d00bc7ec01abab49f5ed63f7c5c9a947e3163a01 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Fri, 25 Jul 2025 13:26:08 -0500 Subject: [PATCH 22/23] Adds min version for ellmer dependency --- r/DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 8fc43f6..ec17329 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -17,7 +17,7 @@ RoxygenNote: 7.3.2 Imports: cli, dplyr, - ellmer, + ellmer (>=0.3.0), fs, glue, jsonlite, From 8bc5225fce48988e605ae0cfce441076eb183c35 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Fri, 25 Jul 2025 13:32:23 -0500 Subject: [PATCH 23/23] Fixes version number --- r/DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/DESCRIPTION b/r/DESCRIPTION index ec17329..30007d4 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -17,7 +17,7 @@ RoxygenNote: 7.3.2 Imports: cli, dplyr, - ellmer (>=0.3.0), + ellmer (>= 0.3.0), fs, glue, jsonlite,
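
The ellmer-related changes above (introduced in patch 18, backed out in patch 20, and reduced to a single import in patch 21) revolve around one batching pattern. Below is a minimal, self-contained R sketch of that pattern, assuming ellmer >= 0.3.0 as pinned in the DESCRIPTION; the chat object and the sentiment template are illustrative stand-ins, not mall's actual prompts:

```r
library(ellmer)

# Any ellmer Chat object works; llama3.2 mirrors mall's session default
chat <- chat_ollama(model = "llama3.2")

# mall's R prompts carry glue-style `{x}` placeholders; doubling the braces
# (the same gsub() calls patch 18 used) turns them into the `{{x}}`
# placeholders that ellmer::interpolate() substitutes
template <- "Return only the sentiment of the following text: {x}"
template <- gsub("\\{", "\\{\\{", template)
template <- gsub("\\}", "\\}\\}", template)

# interpolate() is vectorized, so this yields one prompt per element of x
prompts <- interpolate(template, x = c("I am happy", "I am sad"))

# Submit all prompts concurrently; returns one text response per prompt
res <- parallel_chat_text(chat, prompts)
```

The appeal of the pattern is that `parallel_chat_text()` replaces the row-by-row `map_chr()` loop with concurrent requests, while patch 21 narrows the package dependency to a single `importFrom(ellmer, parallel_chat_text)` rather than a full `import(ellmer)`.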