From 2f3d42cf5fc6501631d4eba15b4b970ef94d3781 Mon Sep 17 00:00:00 2001 From: Jeffrey Ward Date: Fri, 5 Dec 2025 16:57:52 -0500 Subject: [PATCH] Initial. --- README.md | 2 + file_processor/.formatter.exs | 4 + file_processor/.gitignore | 12 + file_processor/README.md | 21 ++ file_processor/config/config.exs | 16 ++ file_processor/dep_pdf_extractor_mod/.hex | Bin 0 -> 269 bytes .../dep_pdf_extractor_mod/CHANGELOG.md | 79 ++++++ file_processor/dep_pdf_extractor_mod/LICENSE | 21 ++ .../dep_pdf_extractor_mod/README.md | 91 +++++++ .../dep_pdf_extractor_mod/hex_metadata.config | 22 ++ .../lib/pdf_extractor.ex | 228 +++++++++++++++++ .../lib/pdf_extractor/pdf_plumber.ex | 189 ++++++++++++++ file_processor/dep_pdf_extractor_mod/mix.exs | 90 +++++++ file_processor/lib/file_processor.ex | 88 +++++++ .../lib/file_processor/application.ex | 23 ++ file_processor/lib/file_processor/impl.ex | 230 ++++++++++++++++++ file_processor/lib/file_processor/structs.ex | 7 + file_processor/lib/results_server.ex | 89 +++++++ file_processor/mix.exs | 31 +++ file_processor/mix.lock | 11 + file_processor/start.txt | 1 + file_processor/test/file_processor_test.exs | 8 + file_processor/test/test_helper.exs | 1 + 23 files changed, 1264 insertions(+) create mode 100644 README.md create mode 100644 file_processor/.formatter.exs create mode 100644 file_processor/.gitignore create mode 100644 file_processor/README.md create mode 100644 file_processor/config/config.exs create mode 100644 file_processor/dep_pdf_extractor_mod/.hex create mode 100644 file_processor/dep_pdf_extractor_mod/CHANGELOG.md create mode 100644 file_processor/dep_pdf_extractor_mod/LICENSE create mode 100644 file_processor/dep_pdf_extractor_mod/README.md create mode 100644 file_processor/dep_pdf_extractor_mod/hex_metadata.config create mode 100644 file_processor/dep_pdf_extractor_mod/lib/pdf_extractor.ex create mode 100644 file_processor/dep_pdf_extractor_mod/lib/pdf_extractor/pdf_plumber.ex create mode 100644 
file_processor/dep_pdf_extractor_mod/mix.exs create mode 100644 file_processor/lib/file_processor.ex create mode 100644 file_processor/lib/file_processor/application.ex create mode 100644 file_processor/lib/file_processor/impl.ex create mode 100644 file_processor/lib/file_processor/structs.ex create mode 100644 file_processor/lib/results_server.ex create mode 100644 file_processor/mix.exs create mode 100644 file_processor/mix.lock create mode 100644 file_processor/start.txt create mode 100644 file_processor/test/file_processor_test.exs create mode 100644 file_processor/test/test_helper.exs diff --git a/README.md b/README.md new file mode 100644 index 0000000..c96abcf --- /dev/null +++ b/README.md @@ -0,0 +1,2 @@ +# FileProcessor + diff --git a/file_processor/.formatter.exs b/file_processor/.formatter.exs new file mode 100644 index 0000000..d2cda26 --- /dev/null +++ b/file_processor/.formatter.exs @@ -0,0 +1,4 @@ +# Used by "mix format" +[ + inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"] +] diff --git a/file_processor/.gitignore b/file_processor/.gitignore new file mode 100644 index 0000000..fc5d8e3 --- /dev/null +++ b/file_processor/.gitignore @@ -0,0 +1,12 @@ +# ---> Elixir +/_build +/cover +/deps +/doc +/.fetch +erl_crash.dump +*.ez +*.beam +/config/*.secret.exs +.elixir_ls/ + diff --git a/file_processor/README.md b/file_processor/README.md new file mode 100644 index 0000000..8820a30 --- /dev/null +++ b/file_processor/README.md @@ -0,0 +1,21 @@ +# FileProcessor + +**TODO: Add description** + +## Installation + +If [available in Hex](https://hex.pm/docs/publish), the package can be installed +by adding `file_processor` to your list of dependencies in `mix.exs`: + +```elixir +def deps do + [ + {:file_processor, "~> 0.1.0"} + ] +end +``` + +Documentation can be generated with [ExDoc](https://github.com/elixir-lang/ex_doc) +and published on [HexDocs](https://hexdocs.pm). Once published, the docs can +be found at <https://hexdocs.pm/file_processor>.
+ diff --git a/file_processor/config/config.exs b/file_processor/config/config.exs new file mode 100644 index 0000000..c880d87 --- /dev/null +++ b/file_processor/config/config.exs @@ -0,0 +1,16 @@ +import Config + +config :file_processor, + filepath: "/tmp/myfiles", + rfppath: "/tmp/rfpfile" + +config :file_processor, FileProcessor.Application, + pubsub: [ + phoenix_pubsub: [ + adapter: Phoenix.PubSub.PG2, + pool_size: 1 + ] + ] + +# Only need this when we have different envs +# import_config "#{config_env()}.exs" diff --git a/file_processor/dep_pdf_extractor_mod/.hex b/file_processor/dep_pdf_extractor_mod/.hex new file mode 100644 index 0000000000000000000000000000000000000000..a646181e6429c973ca2ec8029d4d6201efeb7d09 GIT binary patch literal 269 zcmZ9^OH#uy5Cl*YhtGx_B_)l%v*RYIu{6R4B)c33dC$dBShDM`e$_wYVeD%^@=gcp zwyx`*>akGf+~VsJ(}nkKrLfIweg5Y4I?d%iJ&LnAi+Wsnng4owSaN;IROr32uVx-< zpQrt!o~BZGy@Zj&=k=EF{Edk)NXe)yDeb(Wl#EBqQBfcfrNAM890V{0*PO9rnsZ=D yW8`|8Z`=P 0.4.0 for Python integration +- Requires Python with pdfplumber package installed + +[Unreleased]: https://github.com/YOUR_USERNAME/pdf_extractor/compare/v0.3.0...HEAD +[0.3.0]: https://github.com/YOUR_USERNAME/pdf_extractor/compare/v0.2.1...v0.3.0 +[0.2.1]: https://github.com/YOUR_USERNAME/pdf_extractor/compare/v0.2.0...v0.2.1 +[0.2.0]: https://github.com/YOUR_USERNAME/pdf_extractor/compare/v0.1.0...v0.2.0 +[0.1.0]: https://github.com/YOUR_USERNAME/pdf_extractor/releases/tag/v0.1.0 diff --git a/file_processor/dep_pdf_extractor_mod/LICENSE b/file_processor/dep_pdf_extractor_mod/LICENSE new file mode 100644 index 0000000..e4664b5 --- /dev/null +++ b/file_processor/dep_pdf_extractor_mod/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 Nelson Estevão + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, 
merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/file_processor/dep_pdf_extractor_mod/README.md b/file_processor/dep_pdf_extractor_mod/README.md new file mode 100644 index 0000000..5444e16 --- /dev/null +++ b/file_processor/dep_pdf_extractor_mod/README.md @@ -0,0 +1,91 @@ +# PdfExtractor + +[![Release](https://img.shields.io/hexpm/v/pdf_extractor.svg)](https://hex.pm/packages/pdf_extractor) +[![Documentation](https://img.shields.io/badge/docs-hexpm-blue.svg)](https://hexdocs.pm/pdf_extractor) +[![Downloads](https://img.shields.io/hexpm/dt/pdf_extractor.svg)](https://hex.pm/packages/pdf_extractor) +[![License](https://img.shields.io/hexpm/l/pdf_extractor.svg)](https://hex.pm/packages/pdf_extractor) +[![Last Commit](https://img.shields.io/github/last-commit/nelsonmestevao/pdf_extractor.svg)](https://github.com/nelsonmestevao/pdf_extractor) + + +A powerful and easy-to-use Elixir library for extracting text and metadata from PDF files. + +PdfExtractor leverages Python's `pdfplumber` library through seamless integration to provide +robust PDF text extraction capabilities. It supports both file-based and binary-based operations, +making it suitable for various use cases from local file processing to web-based PDF handling. 
+ +## Features + +- 🔍 Extract text from single or multiple PDF pages +- 📍 Area-based extraction using bounding boxes +- 🌐 Work with PDF data directly from memory (e.g., HTTP downloads) +- 📊 Get PDF metadata like title, author, creation date +- 🐍 Leverages Python's powerful `pdfplumber` library +- 🚀 Simple and intuitive API +- ✅ Comprehensive test coverage +- 📚 Full documentation + +## Installation + +Add `pdf_extractor` to your list of dependencies in `mix.exs`: + +```elixir +def deps do + [ + {:pdf_extractor, "~> 0.5.0"} + ] +end +``` + +Then start it in your application start function: + +```elixir +defmodule MyApp.Application do + use Application + + def start(_type, _args) do + children = [ + PdfExtractor, + ... + ] + + opts = [strategy: :one_for_one, name: MyApp.Supervisor] + Supervisor.start_link(children, opts) + end +end +``` + +## Usage + +Extract text from specific regions using bounding boxes `{x0, y0, x1, y1}`: + +```elixir +areas = %{ + 0 => {0, 0, 300, 200}, # Top-left area of page 0 + 1 => [ + {200, 300, 600, 500}, # Bottom-right area of page 1 + {0, 0, 200, 250}, # Top-left area of page 1 + ] +} +PdfExtractor.extract_text("path/to/document.pdf", areas) +``` + +### Return Format + +The function returns a map where keys are page numbers and values are the extracted text: + +```elixir +%{ + 0 => "Text from page 0...", + 1 => ["Text from page 1 (first area)...", "Text from page 1 (second area)..."], + 2 => "Text from page 2..." +} +``` + +## License + +This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 
+ +## Acknowledgments + +- Built on top of the excellent [pdfplumber](https://github.com/jsvine/pdfplumber) Python library +- Uses [pythonx](https://github.com/livebook-dev/pythonx) for seamless Python integration diff --git a/file_processor/dep_pdf_extractor_mod/hex_metadata.config b/file_processor/dep_pdf_extractor_mod/hex_metadata.config new file mode 100644 index 0000000..35a2332 --- /dev/null +++ b/file_processor/dep_pdf_extractor_mod/hex_metadata.config @@ -0,0 +1,22 @@ +{<<"links">>, + [{<<"Changelog">>, + <<"https://github.com/nelsonmestevao/pdf_extractor/blob/main/CHANGELOG.md">>}, + {<<"GitHub">>,<<"https://github.com/nelsonmestevao/pdf_extractor">>}]}. +{<<"name">>,<<"pdf_extractor">>}. +{<<"version">>,<<"0.5.0">>}. +{<<"description">>, + <<"A lightweight Elixir library for extracting text from PDF files using Python's pdfplumber.\nSupports single and multi-page extraction with optional area filtering.">>}. +{<<"elixir">>,<<"~> 1.15">>}. +{<<"app">>,<<"pdf_extractor">>}. +{<<"files">>, + [<<"lib">>,<<"lib/pdf_extractor">>,<<"lib/pdf_extractor/pdf_plumber.ex">>, + <<"lib/pdf_extractor.ex">>,<<"mix.exs">>,<<"README.md">>,<<"LICENSE">>, + <<"CHANGELOG.md">>]}. +{<<"licenses">>,[<<"MIT">>]}. +{<<"requirements">>, + [[{<<"name">>,<<"pythonx">>}, + {<<"app">>,<<"pythonx">>}, + {<<"optional">>,false}, + {<<"requirement">>,<<"~> 0.4.4">>}, + {<<"repository">>,<<"hexpm">>}]]}. +{<<"build_tools">>,[<<"mix">>]}. 
diff --git a/file_processor/dep_pdf_extractor_mod/lib/pdf_extractor.ex b/file_processor/dep_pdf_extractor_mod/lib/pdf_extractor.ex new file mode 100644 index 0000000..17a9b19 --- /dev/null +++ b/file_processor/dep_pdf_extractor_mod/lib/pdf_extractor.ex @@ -0,0 +1,228 @@ +defmodule PdfExtractor do + @moduledoc "README.md" + |> File.read!() + |> String.split("\n\n") + |> tl() + |> tl() + |> Enum.join("\n\n") + use GenServer + + @external_resource "README.md" + + # Client + + def start_link(opts \\ []) do + opts = Keyword.validate!(opts, name: __MODULE__) + GenServer.start_link(__MODULE__, [], name: opts[:name]) + end + + @doc ~S""" + Extracts text from PDF pages. + + It supports extracting from single pages, multiple pages, and specific areas within pages. + + ## Page Numbers + + - **Integer**: Extract from single page (e.g., `0` for first page) + - **List**: Extract from multiple pages (e.g., `[0, 1, 2]`) + - **Empty list** `[]`: Extract from all pages (default) + + ## Areas Format + + Areas are specified as a map where keys are page numbers and values are bounding boxes: + + - **Single area**: `%{0 => {x0, y0, x1, y1}}` + - **Multiple areas**: `%{0 => [{x0, y0, x1, y1}, {x2, y2, x3, y3}]}` + - **Mixed**: `%{0 => {x0, y0, x1, y1}, 1 => [{x2, y2, x3, y3}, {x4, y4, x5, y5}]}` + + ## Examples + + Extract text from all pages. 
+ + iex> PdfExtractor.extract_text("priv/fixtures/fatura.pdf") + {:ok, + %{ + 0 => + "Text Example Bill FATURA\n# 2025010002\nData: Jun 21, 2025\nProjeto de lei para:\nSaldo devedor: 1 525,59 €\nElixir Company\nItem Quantidade Avaliar Quantia\nTrabalho 1 1 500,00 € 1 500,00 €\nMais trabalho 1 25,59 € 25,59 €\nSubtotal: 1 525,59 €\nImposto (0%): 0,00 €\nTotal: 1 525,59 €", + 1 => + "✂\nReceipt Payment part Account / Payable to\nCH4431999123000889012\n✂\nMax Muster & Söhne\nAccount / Payable to\nCH4431999123000889012 Musterstrasse 123\nMax Muster & Söhne 8000 Seldwyla\nMusterstrasse 123\n8000 Seldwyla\nReference\n210000000003139471430009017\nReference\n210000000003139471430009017\nAdditional information\nBestellung vom 15.10.2020\nPayable by (name/address)\nSimon Muster\nPayable by (name/address)\nMusterstrasse 1\nCurrency Amount\nSimon Muster\n8000 Seldwyla\nCHF 1 949.75 Musterstrasse 1\n8000 Seldwyla\nCurrency Amount\nCHF 1 949.75\nAcceptance point" + }} + + Extract text from only some pages. + + iex> PdfExtractor.extract_text("priv/fixtures/fatura.pdf", [0]) + {:ok, + %{ + 0 => + "Text Example Bill FATURA\n# 2025010002\nData: Jun 21, 2025\nProjeto de lei para:\nSaldo devedor: 1 525,59 €\nElixir Company\nItem Quantidade Avaliar Quantia\nTrabalho 1 1 500,00 € 1 500,00 €\nMais trabalho 1 25,59 € 25,59 €\nSubtotal: 1 525,59 €\nImposto (0%): 0,00 €\nTotal: 1 525,59 €" + }} + + Extract only the titles in the book chapters. + + iex> PdfExtractor.extract_text("priv/fixtures/book.pdf", %{ + ...> 2 => {0, 0, 612, 190}, + ...> 8 => {0, 0, 612, 190}, + ...> 10 => {0, 0, 612, 190} + ...> }) + {:ok, + %{ + 2 => "Introdução – Nota do tradutor", + 8 => "I. Sobre aproveitar o tempo", + 10 => "II. Sobre a falta de foco na Leitura" + }} + + Extract multiple areas from a single page. 
+ + iex> PdfExtractor.extract_text("priv/fixtures/book.pdf", %{ + ...> 1 => [{0, 100, 612, 140}, {0, 400, 612, 440}] + ...> }) + {:ok, + %{ + 1 => [ + "CARTAS DE UM ESTOICO, Volume I", + "Montecristo Editora Ltda.\ne-mail: editora@montecristoeditora.com.br" + ] + }} + """ + def extract_text(file_path, pages \\ []) do + GenServer.call(__MODULE__, {:extract_text, [file_path, pages]}) + end + + def extract_text_timeout(file_path, pages \\ [], timeout) do + GenServer.call(__MODULE__, {:extract_text, [file_path, pages]}, timeout) + end + + @doc ~S""" + Extracts text from PDF binary data. See `extract_text/3` for details on how to specify pages and areas. + + This function allows you to extract text from PDF data that's already in memory, + such as data downloaded from a URL or received via an API. This avoids the need + to write the PDF to the filesystem. + + ## Examples + + Extract text from all pages. + + iex> content = File.read!("priv/fixtures/fatura.pdf") + ...> PdfExtractor.extract_text_from_binary(content) + {:ok, + %{ + 0 => + "Text Example Bill FATURA\n# 2025010002\nData: Jun 21, 2025\nProjeto de lei para:\nSaldo devedor: 1 525,59 €\nElixir Company\nItem Quantidade Avaliar Quantia\nTrabalho 1 1 500,00 € 1 500,00 €\nMais trabalho 1 25,59 € 25,59 €\nSubtotal: 1 525,59 €\nImposto (0%): 0,00 €\nTotal: 1 525,59 €", + 1 => + "✂\nReceipt Payment part Account / Payable to\nCH4431999123000889012\n✂\nMax Muster & Söhne\nAccount / Payable to\nCH4431999123000889012 Musterstrasse 123\nMax Muster & Söhne 8000 Seldwyla\nMusterstrasse 123\n8000 Seldwyla\nReference\n210000000003139471430009017\nReference\n210000000003139471430009017\nAdditional information\nBestellung vom 15.10.2020\nPayable by (name/address)\nSimon Muster\nPayable by (name/address)\nMusterstrasse 1\nCurrency Amount\nSimon Muster\n8000 Seldwyla\nCHF 1 949.75 Musterstrasse 1\n8000 Seldwyla\nCurrency Amount\nCHF 1 949.75\nAcceptance point" + }} + + Extract text from only some pages. 
+ + iex> content = File.read!("priv/fixtures/fatura.pdf") + ...> PdfExtractor.extract_text_from_binary(content, [0]) + {:ok, + %{ + 0 => + "Text Example Bill FATURA\n# 2025010002\nData: Jun 21, 2025\nProjeto de lei para:\nSaldo devedor: 1 525,59 €\nElixir Company\nItem Quantidade Avaliar Quantia\nTrabalho 1 1 500,00 € 1 500,00 €\nMais trabalho 1 25,59 € 25,59 €\nSubtotal: 1 525,59 €\nImposto (0%): 0,00 €\nTotal: 1 525,59 €" + }} + + Extract only the titles in the book chapters. + + iex> content = File.read!("priv/fixtures/book.pdf") + ...> + ...> PdfExtractor.extract_text_from_binary(content, %{ + ...> 2 => {0, 0, 612, 190}, + ...> 8 => {0, 0, 612, 190}, + ...> 10 => {0, 0, 612, 190} + ...> }) + {:ok, + %{ + 2 => "Introdução – Nota do tradutor", + 8 => "I. Sobre aproveitar o tempo", + 10 => "II. Sobre a falta de foco na Leitura" + }} + + Extract multiple areas from a single page. + + iex> content = File.read!("priv/fixtures/book.pdf") + ...> + ...> PdfExtractor.extract_text_from_binary(content, %{ + ...> 1 => [{0, 100, 612, 140}, {0, 400, 612, 440}] + ...> }) + {:ok, + %{ + 1 => [ + "CARTAS DE UM ESTOICO, Volume I", + "Montecristo Editora Ltda.\ne-mail: editora@montecristoeditora.com.br" + ] + }} + + """ + def extract_text_from_binary(binary, pages \\ []) do + GenServer.call(__MODULE__, {:extract_text_from_binary, [binary, pages]}) + end + + @doc """ + Extracts metadata from a PDF file info trailers. Typically includes "CreationDate", "ModDate", "Producer", et cetera. + + ## Examples + + iex> PdfExtractor.extract_metadata("priv/fixtures/book.pdf") + {:ok, + %{ + "CreationDate" => "D:20250718212328Z", + "Creator" => "Stirling-PDF v0.44.2", + "ModDate" => "D:20250718212328Z", + "Producer" => "Stirling-PDF v0.44.2" + }} + + """ + def extract_metadata(file_path) do + GenServer.call(__MODULE__, {:extract_metadata, [file_path]}) + end + + @doc """ + Extracts metadata from PDF binary data. 
Similar to `extract_metadata/1` but works with PDF data in memory instead of + files. + + ## Examples + + iex> content = File.read!("priv/fixtures/book.pdf") + ...> PdfExtractor.extract_metadata_from_binary(content) + {:ok, + %{ + "CreationDate" => "D:20250718212328Z", + "Creator" => "Stirling-PDF v0.44.2", + "ModDate" => "D:20250718212328Z", + "Producer" => "Stirling-PDF v0.44.2" + }} + + """ + def extract_metadata_from_binary(binary) do + GenServer.call(__MODULE__, {:extract_metadata_from_binary, [binary]}) + end + + # Server + + @doc false + @impl true + def init([] = state) do + try do + :ok = PdfExtractor.PdfPlumber.start() + rescue + e in RuntimeError -> + if e.message =~ ~r/Python interpreter has already been initialized/ do + :ok + else + reraise e, __STACKTRACE__ + end + end + + {:ok, state} + end + + @doc false + @impl true + def handle_call({function, args}, _from, state) when is_atom(function) and is_list(args) do + {:reply, {:ok, apply(PdfExtractor.PdfPlumber, function, args)}, state} + rescue + exception in Pythonx.Error -> {:reply, {:error, exception}, state} + end +end diff --git a/file_processor/dep_pdf_extractor_mod/lib/pdf_extractor/pdf_plumber.ex b/file_processor/dep_pdf_extractor_mod/lib/pdf_extractor/pdf_plumber.ex new file mode 100644 index 0000000..09577ee --- /dev/null +++ b/file_processor/dep_pdf_extractor_mod/lib/pdf_extractor/pdf_plumber.ex @@ -0,0 +1,189 @@ +defmodule PdfExtractor.PdfPlumber do + @moduledoc false + + def start do + Pythonx.uv_init(""" + [project] + name = "pdf_extractor" + version = "#{to_string(version())}" + requires-python = "==3.12.*" + dependencies = [ + "pdfplumber==0.11.7" + ] + """) + end + + @type area :: {non_neg_integer(), non_neg_integer(), non_neg_integer(), non_neg_integer()} + @type page :: non_neg_integer() + + @spec extract_text( + file_path :: String.t(), + pages :: page() | list(page()) | %{page() => area() | [area()] | nil} + ) :: %{page() => String.t() | list(String.t())} + def 
extract_text(file_path, page_number) when is_integer(page_number) do + extract_text(file_path, List.wrap(page_number)) + end + + def extract_text(file_path, pages) when is_list(pages) do + """ + #{python_extract_code()} + + main(file_path.decode('utf-8'), page_numbers, areas) + """ + |> Pythonx.eval(%{ + "file_path" => file_path, + "page_numbers" => pages, + "areas" => %{} + }) + |> elem(0) + |> Pythonx.decode() + |> to_map(pages) + end + + def extract_text(file_path, pages) when is_map(pages) do + """ + #{python_extract_code()} + + main(file_path.decode('utf-8'), page_numbers, areas) + """ + |> Pythonx.eval(%{ + "file_path" => file_path, + "page_numbers" => Map.keys(pages), + "areas" => pages + }) + |> elem(0) + |> Pythonx.decode() + |> to_map(Map.keys(pages)) + end + + @doc """ + This version avoids the need to put the pdf on a filesystem. + This allows this to work + url = "https://erlang.org/download/armstrong_thesis_2003.pdf" + url |> :httpc.request() |> elem(1) |> elem(2) |> :binary.list_to_bin() |> PdfExtractor.extract_text_from_binary() + """ + def extract_text_from_binary(binary, page_number) when is_integer(page_number) do + extract_text_from_binary(binary, List.wrap(page_number)) + end + + def extract_text_from_binary(binary, pages) when is_list(pages) do + """ + from io import BytesIO + + #{python_extract_code()} + + main(BytesIO(binary), page_numbers, areas) + """ + |> Pythonx.eval(%{ + "binary" => binary, + "page_numbers" => pages, + "areas" => %{} + }) + |> elem(0) + |> Pythonx.decode() + |> to_map(pages) + end + + def extract_text_from_binary(binary, pages) when is_map(pages) do + """ + from io import BytesIO + + #{python_extract_code()} + + main(BytesIO(binary), page_numbers, areas) + """ + |> Pythonx.eval(%{ + "binary" => binary, + "page_numbers" => Map.keys(pages), + "areas" => pages + }) + |> elem(0) + |> Pythonx.decode() + |> to_map(Map.keys(pages)) + end + + defp python_extract_code do + """ + import pdfplumber + import logging + + 
logging.getLogger("pdfminer").setLevel(logging.ERROR) + + def extract_from_page(page, areas=None): + if areas is None: + return page.extract_text() + elif isinstance(areas, list): + return [page.within_bbox(area).extract_text() for area in areas] + else: + return page.within_bbox(areas).extract_text() + + def main(content, page_numbers, areas): + results = [] + with pdfplumber.open(content) as pdf: + total_pages = len(pdf.pages) + if page_numbers == []: + page_numbers = list(range(total_pages)) + for page_number in page_numbers: + if page_number >= 0 and page_number < total_pages: + results.append(extract_from_page(pdf.pages[page_number], areas.get(page_number))) + return results + """ + end + + def extract_metadata(file_path) do + """ + #{python_extract_metadata_code()} + + main(file_path.decode('utf-8')) + """ + |> Pythonx.eval(%{ + "file_path" => file_path + }) + |> elem(0) + |> Pythonx.decode() + end + + def extract_metadata_from_binary(binary) do + """ + from io import BytesIO + + #{python_extract_metadata_code()} + + main(BytesIO(binary)) + """ + |> Pythonx.eval(%{ + "binary" => binary + }) + |> elem(0) + |> Pythonx.decode() + end + + defp python_extract_metadata_code do + """ + import pdfplumber + import logging + + logging.getLogger("pdfminer").setLevel(logging.ERROR) + + def main(content): + with pdfplumber.open(content) as pdf: + return pdf.metadata + """ + end + + defp to_map(texts, []) when is_list(texts) do + texts + |> Enum.with_index(&{&2, &1}) + |> Map.new() + end + + defp to_map(texts, page_numbers) when is_list(texts) do + page_numbers + |> Enum.zip(texts) + |> Map.new() + end + + defp version do + Application.spec(:pdf_extractor, :vsn) + end +end diff --git a/file_processor/dep_pdf_extractor_mod/mix.exs b/file_processor/dep_pdf_extractor_mod/mix.exs new file mode 100644 index 0000000..343d137 --- /dev/null +++ b/file_processor/dep_pdf_extractor_mod/mix.exs @@ -0,0 +1,90 @@ +defmodule PdfExtractor.MixProject do + use Mix.Project + + @app 
:pdf_extractor + @name "PdfExtractor" + @version "0.5.0" + @source_url "https://github.com/nelsonmestevao/pdf_extractor" + + def project do + [ + name: @name, + app: @app, + version: @version, + elixir: "~> 1.15", + start_permanent: Mix.env() == :prod, + deps: deps(), + description: description(), + package: package(), + docs: docs(), + aliases: aliases(), + dialyzer: dialyzer(), + source_url: @source_url + ] + end + + def application do + [ + extra_applications: [:logger] + ] + end + + defp deps do + [ + {:pythonx, "~> 0.4.4"}, + + # tools + {:credo, "~> 1.7", only: [:dev, :test], runtime: false}, + {:dialyxir, "~> 1.4", only: [:dev, :test], runtime: false}, + {:doctest_formatter, "~> 0.4.0", only: [:dev, :test], runtime: false}, + {:ex_doc, "~> 0.38", only: :dev, runtime: false}, + {:styler, "~> 1.0", only: [:dev, :test], runtime: false} + ] + end + + defp aliases do + [ + "lint.dialyzer": ["dialyzer --format dialyxir"] + ] + end + + defp description do + """ + A lightweight Elixir library for extracting text from PDF files using Python's pdfplumber. + Supports single and multi-page extraction with optional area filtering. 
+    """
+  end
+
+  defp package do
+    [
+      name: @app,
+      files: ~w(lib mix.exs README.md LICENSE CHANGELOG*),
+      licenses: ["MIT"],
+      links: %{
+        "GitHub" => @source_url,
+        "Changelog" => "#{@source_url}/blob/main/CHANGELOG.md"
+      },
+      maintainers: ["Nelson Estevão "]
+    ]
+  end
+
+  defp docs do
+    [
+      main: "readme",
+      name: @name,
+      source_ref: "v#{@version}",
+      source_url: @source_url,
+      extras: ["README.md", "CHANGELOG.md", "LICENSE"]
+    ]
+  end
+
+  defp dialyzer do
+    [
+      flags: [:no_opaque],
+      list_unused_filters: true,
+      plt_add_deps: :apps_tree,
+      plt_add_apps: [:ex_unit, :iex, :mix, :credo_naming],
+      plt_file: {:no_warn, "priv/plts/elixir-#{System.version()}-erlang-otp-#{System.otp_release()}.plt"}
+    ]
+  end
+end
diff --git a/file_processor/lib/file_processor.ex b/file_processor/lib/file_processor.ex
new file mode 100644
index 0000000..cf5a3df
--- /dev/null
+++ b/file_processor/lib/file_processor.ex
@@ -0,0 +1,103 @@
+defmodule FileProcessor do
+  @moduledoc """
+  GenServer front end for the upload directory and the RFP directory.
+
+  Every request is forwarded to `FileProcessor.Impl`; the server state is
+  passed through unchanged.
+  """
+
+  use GenServer
+
+  # Client
+
+  @server FileProcessor.Server
+
+  @doc "Starts the server registered under the name `FileProcessor.Server`."
+  def start_link(initial_values) do
+    GenServer.start_link(__MODULE__, initial_values, name: @server)
+  end
+
+  @doc "Stores `contents` as `filename`; replies with the result of `FileProcessor.Impl.put_file/2`."
+  def put_file({filename, contents}) do
+    # The payload stays a nested tuple because handle_call/3 matches on
+    # {:put_file, {filename, file}}; a flat 3-tuple matches no clause and
+    # crashes the server with a FunctionClauseError.
+    GenServer.call(@server, {:put_file, {filename, contents}})
+  end
+
+  @doc "Replies with the result of `FileProcessor.Impl.get_files/0`."
+  def get_files() do
+    GenServer.call(@server, :get_files)
+  end
+
+  @doc "Deletes `filename`; replies with the result of `FileProcessor.Impl.delete_file/1`."
+  def delete_file(filename) do
+    GenServer.call(@server, {:delete_file, filename})
+  end
+
+  @doc "Stores `contents` as the RFP `filename`; replies with the result of `FileProcessor.Impl.put_rfp/2`."
+  def put_rfp({filename, contents}) do
+    # Nested tuple for the same reason as in put_file/1.
+    GenServer.call(@server, {:put_rfp, {filename, contents}})
+  end
+
+  @doc "Replies with the result of `FileProcessor.Impl.get_rfp/0`."
+  def get_rfp() do
+    GenServer.call(@server, :get_rfp)
+  end
+
+  @doc "Deletes the RFP `filename`; replies with the result of `FileProcessor.Impl.delete_rfp/1`."
+  def delete_rfp(filename) do
+    GenServer.call(@server, {:delete_rfp, filename})
+  end
+
+  @doc "Replies with the result of `FileProcessor.Impl.process_files/0`."
+  def process_files() do
+    GenServer.call(@server, :process_files)
+  end
+
+  # Server
+
+  alias FileProcessor.Impl
+
+  @impl true
+  def init(initial_values) do
+    Impl.do_init()
+    {:ok, initial_values}
+  end
+
+  @impl true
+  def handle_call({:put_file, {filename, file}}, _from, current_state) do
+    {:reply, Impl.put_file(filename, file), current_state}
+  end
+
+  @impl true
+  def handle_call(:get_files, _from, current_state) do
+    {:reply, Impl.get_files(), current_state}
+  end
+
+  @impl true
+  def handle_call({:delete_file, filename}, _from, current_state) do
+    {:reply, Impl.delete_file(filename), current_state}
+  end
+
+  @impl true
+  def handle_call({:put_rfp, {filename, file}}, _from, current_state) do
+    {:reply, Impl.put_rfp(filename, file), current_state}
+  end
+
+  @impl true
+  def handle_call(:get_rfp, _from, current_state) do
+    {:reply, Impl.get_rfp(), current_state}
+  end
+
+  @impl true
+  def handle_call({:delete_rfp, filename}, _from, current_state) do
+    {:reply, Impl.delete_rfp(filename), current_state}
+  end
+
+  @impl true
+  def handle_call(:process_files, _from, current_state) do
+    {:reply, Impl.process_files(), current_state}
+  end
+end
diff --git a/file_processor/lib/file_processor/application.ex b/file_processor/lib/file_processor/application.ex
new file mode
100644 index 0000000..f3a6ca0 --- /dev/null +++ b/file_processor/lib/file_processor/application.ex @@ -0,0 +1,23 @@ +defmodule FileProcessor.Application do + # See https://hexdocs.pm/elixir/Application.html + # for more information on OTP Applications + @moduledoc false + + use Application + + @impl true + def start(_type, _args) do + children = [ + # Starts a worker by calling: FileProcessor.Worker.start_link(arg) + {FileProcessor, %{}}, + PdfExtractor, + {ResultsServer, %{:results=>%{}, :good=>[], :bad=>[]}}, + {Phoenix.PubSub, name: TrustedEdgeServer.PubSub} + ] + + # See https://hexdocs.pm/elixir/Supervisor.html + # for other strategies and supported options + opts = [strategy: :one_for_one, name: FileProcessor.Supervisor] + Supervisor.start_link(children, opts) + end +end diff --git a/file_processor/lib/file_processor/impl.ex b/file_processor/lib/file_processor/impl.ex new file mode 100644 index 0000000..5d28361 --- /dev/null +++ b/file_processor/lib/file_processor/impl.ex @@ -0,0 +1,230 @@ +defmodule FileProcessor.Impl do + require Logger + + @output_dir Application.fetch_env!(:file_processor, :filepath) + @rfp_dir Application.fetch_env!(:file_processor, :rfppath) + + @gpu_node :gpu@kittykat + + def do_init() do + Logger.debug("Using dir: #{@output_dir} ") + res = File.mkdir_p(@output_dir) + Logger.debug("Result = #{res}") + + Logger.debug("Using rfp dir: #{@rfp_dir} ") + res = File.mkdir_p(@rfp_dir) + Logger.debug("Result = #{res}") + end + + def put_file(filename, contents) do + Logger.debug("Using dir: #{@output_dir} ") + Logger.debug("Handling #{filename}...") + + full_path = Path.join(@output_dir, filename) + + case File.write(full_path, contents) do + :ok -> + # Return all files so liveview can update + get_files() + + {:error, reason} -> + {:ok, files} = get_files() + {:error, reason, files} + end + end + + def get_files() do + File.ls(@output_dir) + end + + def delete_file(filename) do + Logger.debug("Deleting #{filename}...") + full_path = 
# Stores `contents` as the single RFP, named `filename`, inside @rfp_dir.
#
# Every file currently in @rfp_dir is removed first, so at most one RFP is
# present afterwards.
#
# Returns whatever get_rfp/0 returns after a successful write, or
# `{:error, reason, names}` when the write fails, where `names` is the current
# directory listing (empty if the directory itself cannot be listed).
def put_rfp(filename, contents) do
  Logger.debug("Using dir: #{@rfp_dir} ")
  Logger.debug("Handling #{filename}...")

  # Remove old one(s). Deletion is best-effort: a failed File.rm/1 is ignored,
  # exactly as before.
  @rfp_dir
  |> Path.join("*")
  |> Path.wildcard()
  |> Enum.each(&File.rm/1)

  # Keep only the last path component so a name such as "../../x" cannot
  # write outside @rfp_dir.
  full_path = Path.join(@rfp_dir, Path.basename(filename))

  case File.write(full_path, contents) do
    :ok ->
      get_rfp()

    {:error, reason} ->
      # The listing can fail for the same reason the write did (for example a
      # missing directory), so it must not be pattern-matched on {:ok, _}.
      {:error, reason, list_rfp_or_empty()}
  end
end

# Lists the entries of @rfp_dir. Returns File.ls/1's result unchanged.
def get_rfp() do
  File.ls(@rfp_dir)
end

# Deletes the RFP called `filename` from @rfp_dir.
#
# Returns whatever get_rfp/0 returns after a successful delete, or
# `{:error, reason, names}` when the delete fails.
def delete_rfp(filename) do
  Logger.debug("Deleting RFP #{filename}...")

  # Path.basename/1 confines the delete to @rfp_dir.
  full_path = Path.join(@rfp_dir, Path.basename(filename))

  case File.rm(full_path) do
    :ok ->
      get_rfp()

    {:error, reason} ->
      {:error, reason, list_rfp_or_empty()}
  end
end

# Directory listing for error replies: the entry names, or [] when @rfp_dir
# cannot be listed.
defp list_rfp_or_empty() do
  case get_rfp() do
    {:ok, names} -> names
    {:error, _reason} -> []
  end
end

# Starts one unlinked process per uploaded file and per RFP file.
#
# Returns `{:error, :rfp_missing}` when @rfp_dir is empty or cannot be listed,
# otherwise `{:ok}` as soon as the workers have been spawned (it does not wait
# for them).
def process_files() do
  case get_rfp() do
    {:ok, [_ | _] = rfp_files} ->
      # Reset results since a new job is starting.
      ResultsServer.reset()

      res = Node.connect(@gpu_node)
      Logger.debug("GPU Node connection: #{inspect(res)}")

      # Start work for all files.
      case get_files() do
        {:ok, files} ->
          Enum.each(files, fn name ->
            full_path = Path.join(@output_dir, name)
            spawn(fn -> FileProcessor.Impl.handle_file(full_path) end)
          end)

        {:error, reason} ->
          Logger.error("Failed to read files dir: #{reason}")
      end

      # Start work for the RFP, reusing the listing obtained above.
      Enum.each(rfp_files, fn name ->
        full_path = Path.join(@rfp_dir, name)
        spawn(fn -> FileProcessor.Impl.handle_rfp(full_path) end)
      end)

      {:ok}

    _empty_or_unreadable ->
      # Previously an unreadable directory raised MatchError here.
      {:error, :rfp_missing}
  end
end

# Extracts the text of `filepath`, splits it into chunks, asks the
# BertEmbedding server on @gpu_node for an embedding of each chunk, and hands
# a %FileResult{} to ResultsServer.put_result/1.
#
# A result is always reported. When text extraction fails, or the embedding
# call exits, the result carries an empty `chunks` list.
def handle_file(filepath) do
  Logger.debug("Processing #{filepath}")

  chunks =
    case extract_text(get_filetype(filepath), filepath) do
      {:error} ->
        # Failure is reported to the results server as an empty chunk list.
        []

      good_text ->
        TextChunker.split(good_text, chunk_size: 100, chunk_overlap: 10)
    end

  all_chunks =
    try do
      Enum.map(chunks, fn chunk ->
        embedding =
          GenServer.call({BertEmbedding, @gpu_node}, {:embed, chunk.text}, :infinity)

        %ChunkResult{chunk: chunk, embedding: embedding}
      end)
    catch
      # GenServer.call/3 exits when the remote server is missing or dies.
      # Without this clause the worker would die before reporting anything.
      :exit, reason ->
        Logger.error("Embedding failed for #{filepath}: #{inspect(reason)}")
        []
    end

  ResultsServer.put_result(%FileResult{filename: Path.basename(filepath), chunks: all_chunks})
end

# Placeholder for RFP processing: currently only logs the path.
def handle_rfp(filepath) do
  Logger.debug("Processing RFP #{filepath}")
end

# Determines the type tag used to pick an extract_text/2 clause.
#
# Returns the first element of the tuple reported by FileType.from_path/1, or
# "txt" when detection fails and the file has a .txt extension (any letter
# case), or `{:error}` otherwise.
defp get_filetype(filepath) do
  res = FileType.from_path(filepath)
  Logger.debug("Filetype = #{inspect(res)} for #{filepath}")

  case res do
    {:ok, {type, _rest}} ->
      type

    {:error, _} ->
      # Compare the real extension: a bare suffix test also accepted names
      # such as "mytxt" and rejected "NOTES.TXT".
      if String.downcase(Path.extname(filepath)) == ".txt" do
        "txt"
      else
        {:error}
      end
  end
end

#############################################
# Extract text functions for each filetype
#############################################

# Plain-text files: returns the file contents, or `{:error}` when the file
# cannot be read.
defp extract_text("txt", filepath) do
  Logger.debug("Extracting from text file: #{filepath}")

  case File.read(filepath) do
    {:ok, content} ->
      content

    {:error, reason} ->
      Logger.error("Error reading file: #{reason}")
      {:error}
  end
end
defmodule ResultsServer do
  @moduledoc """
  GenServer that collects the outcome of each processed file.

  State:

      %{
        results: %{filename => %FileResult{}},
        good: [filename],
        bad: [filename]
      }

  A file whose `FileResult` has an empty `chunks` list is recorded as bad;
  any other result is recorded as good and stored under `:results`.
  """

  require Logger
  use GenServer

  # State used by reset/0 and as the base for whatever start_link/1 is given.
  @empty_state %{results: %{}, good: [], bad: []}

  # Client

  # Starts the server registered under the name ResultsServer.
  def start_link(initial_values) do
    GenServer.start_link(__MODULE__, initial_values, name: ResultsServer)
  end

  # Records one %FileResult{} (asynchronous).
  def put_result(file_result) do
    GenServer.cast(ResultsServer, {:put_result, file_result})
  end

  # Logs the current good/bad lists and stored chunks (asynchronous).
  def print_results() do
    GenServer.cast(ResultsServer, :print_results)
  end

  # Clears all collected results (asynchronous).
  def reset() do
    GenServer.cast(ResultsServer, :reset)
  end

  # Server

  # Every handler reads :results, :good and :bad, so make sure they exist
  # whatever the supervisor passed in. Keys supplied by the caller win.
  @impl true
  def init(initial_values) when is_map(initial_values) do
    {:ok, Map.merge(@empty_state, initial_values)}
  end

  def init(_initial_values) do
    {:ok, @empty_state}
  end

  @impl true
  def handle_cast({:put_result, file_result}, current_state) do
    filename = file_result.filename
    Logger.debug("Got result: #{inspect(filename)}")

    {new_state, status} =
      case file_result.chunks do
        [] ->
          {%{current_state | bad: current_state.bad ++ [filename]}, :bad}

        _ ->
          updated = %{
            current_state
            | good: current_state.good ++ [filename],
              results: Map.put(current_state.results, filename, file_result)
          }

          {updated, :good}
      end

    done_files = new_state.bad ++ new_state.good

    # A failed listing must not take the server (and its state) down.
    all_files =
      case FileProcessor.get_files() do
        {:ok, files} ->
          files

        {:error, reason} ->
          Logger.error("Failed to list files for progress: #{inspect(reason)}")
          []
      end

    remaining = all_files -- done_files
    p = fraction_done(length(done_files), length(all_files))

    Phoenix.PubSub.broadcast(
      TrustedEdgeServer.PubSub,
      "the_topic",
      {:new_result, {filename, status}}
    )

    Phoenix.PubSub.broadcast(TrustedEdgeServer.PubSub, "the_topic", {:new_percent_done, p})
    Logger.info("Percent done: #{p}")
    Logger.info("Remaining: #{inspect(remaining)}")

    {:noreply, new_state}
  end

  @impl true
  def handle_cast(:print_results, current_state) do
    Logger.debug("Good: #{inspect(current_state.good)}")
    Logger.debug("Bad: #{inspect(current_state.bad)}")

    Enum.each(current_state.results, fn {key, value} ->
      Logger.debug("File: #{key}")
      Logger.debug("Chunks: #{inspect(value.chunks)}")
    end)

    {:noreply, current_state}
  end

  @impl true
  def handle_cast(:reset, _current_state) do
    {:noreply, @empty_state}
  end

  # Progress as a float between 0.0 and 1.0.
  #
  # With no files listed there is nothing left to wait for, so report 1.0
  # rather than dividing by zero. The ratio is capped at 1.0 because finished
  # files may no longer be in the listing.
  defp fraction_done(_done, 0), do: 1.0
  defp fraction_done(done, total), do: min(done / total, 1.0)
end
# Hex dependencies of the project, as `{app, version_requirement}` pairs.
# Written with keyword-list sugar, which builds exactly the same list of
# two-element tuples.
defp deps do
  [
    file_type: "~> 0.1",
    text_chunker: "~> 0.5.2",
    pdf_extractor: "~> 0.5.0",
    phoenix_pubsub: "~> 2.0"
  ]
end
defmodule FileProcessorTest do
  # Runs FileProcessor's doctests plus the generated smoke test.
  # NOTE(review): FileProcessor.hello/0 is not visible in this chunk — confirm
  # the module still defines it.
  use ExUnit.Case
  doctest FileProcessor

  test "greets the world" do
    greeting = FileProcessor.hello()
    assert greeting == :world
  end
end