Initial.
This commit is contained in:
4
file_processor/.formatter.exs
Normal file
4
file_processor/.formatter.exs
Normal file
@@ -0,0 +1,4 @@
|
||||
# Used by "mix format"
|
||||
[
|
||||
inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"]
|
||||
]
|
||||
12
file_processor/.gitignore
vendored
Normal file
12
file_processor/.gitignore
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
# ---> Elixir
|
||||
/_build
|
||||
/cover
|
||||
/deps
|
||||
/doc
|
||||
/.fetch
|
||||
erl_crash.dump
|
||||
*.ez
|
||||
*.beam
|
||||
/config/*.secret.exs
|
||||
.elixir_ls/
|
||||
|
||||
21
file_processor/README.md
Normal file
21
file_processor/README.md
Normal file
@@ -0,0 +1,21 @@
|
||||
# FileProcessor
|
||||
|
||||
**TODO: Add description**
|
||||
|
||||
## Installation
|
||||
|
||||
If [available in Hex](https://hex.pm/docs/publish), the package can be installed
|
||||
by adding `file_processor` to your list of dependencies in `mix.exs`:
|
||||
|
||||
```elixir
|
||||
def deps do
|
||||
[
|
||||
{:file_processor, "~> 0.1.0"}
|
||||
]
|
||||
end
|
||||
```
|
||||
|
||||
Documentation can be generated with [ExDoc](https://github.com/elixir-lang/ex_doc)
|
||||
and published on [HexDocs](https://hexdocs.pm). Once published, the docs can
|
||||
be found at <https://hexdocs.pm/file_processor>.
|
||||
|
||||
16
file_processor/config/config.exs
Normal file
16
file_processor/config/config.exs
Normal file
@@ -0,0 +1,16 @@
|
||||
import Config
|
||||
|
||||
# Directories the file processor reads at compile time:
# `:filepath` holds the uploaded files, `:rfppath` holds the RFP document.
config :file_processor, filepath: "/tmp/myfiles", rfppath: "/tmp/rfpfile"

# PubSub settings scoped under the application module key.
config :file_processor, FileProcessor.Application,
  pubsub: [phoenix_pubsub: [adapter: Phoenix.PubSub.PG2, pool_size: 1]]
|
||||
|
||||
# Only need this when we have different envs
|
||||
# import_config "#{config_env()}.exs"
|
||||
BIN
file_processor/dep_pdf_extractor_mod/.hex
Normal file
BIN
file_processor/dep_pdf_extractor_mod/.hex
Normal file
Binary file not shown.
79
file_processor/dep_pdf_extractor_mod/CHANGELOG.md
Normal file
79
file_processor/dep_pdf_extractor_mod/CHANGELOG.md
Normal file
@@ -0,0 +1,79 @@
|
||||
# Changelog
|
||||
|
||||
All notable changes to this project will be documented in this file.
|
||||
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
## [0.5.0] - 2025-08-23
|
||||
|
||||
- Simplify arguments to avoid repetition
|
||||
- Upgrade Python to v3.12 and pdfplumber to v0.11.7
|
||||
- Lower Elixir requirement to v1.15
|
||||
|
||||
## [0.4.1] - 2025-07-30
|
||||
|
||||
- Fix `PdfExtractor.start_link/1` call to link the process correctly to the supervisor tree
|
||||
|
||||
## [0.4.0] - 2025-07-21
|
||||
|
||||
### Changed
|
||||
- Made PdfExtractor a single process to avoid issues with the Python GIL
|
||||
|
||||
## [0.3.0] - 2025-07-20
|
||||
|
||||
### Added
|
||||
- **Multiple Areas Support**: Extract text from multiple bounding box areas on the same page
|
||||
- **Metadata Extraction**: New `extract_metadata/1` and `extract_metadata_from_binary/1` functions
|
||||
- **Binary PDF Processing**: Extract text and metadata directly from PDF binary data
|
||||
- **Enhanced Documentation**: Comprehensive doctests and improved API documentation
|
||||
- **Improved Test Coverage**: Added extensive test suite for new functionality
|
||||
|
||||
### Changed
|
||||
- Enhanced area-based extraction to support lists of areas per page
|
||||
- Improved error handling and edge case management
|
||||
- Updated type specifications for better developer experience
|
||||
|
||||
### Fixed
|
||||
- Better handling of invalid page numbers and area coordinates
|
||||
- Improved Python environment initialization
|
||||
|
||||
## [0.2.1] - 2025-06-27
|
||||
|
||||
### Fixed
|
||||
- Added automatic Python dependencies download and installation
|
||||
- Improved application startup process
|
||||
|
||||
## [0.2.0] - 2025-06-22
|
||||
|
||||
### Added
|
||||
- Project badges and improved README documentation
|
||||
- Enhanced configuration and documentation setup
|
||||
|
||||
### Changed
|
||||
|
||||
- Improved function naming and API consistency
|
||||
- Better documentation structure
|
||||
|
||||
## [0.1.0] - 2025-06-21
|
||||
|
||||
### Added
|
||||
- Initial release of PdfExtractor
|
||||
- Support for extracting text from PDF files using Python's pdfplumber
|
||||
- Single page text extraction
|
||||
- Multi-page text extraction
|
||||
- Basic area-based text extraction with bounding boxes
|
||||
- Initial test suite
|
||||
- Basic documentation and examples
|
||||
|
||||
### Dependencies
|
||||
- pythonx ~> 0.4.0 for Python integration
|
||||
- Requires Python with pdfplumber package installed
|
||||
|
||||
[Unreleased]: https://github.com/nelsonmestevao/pdf_extractor/compare/v0.5.0...HEAD
|
||||
[0.3.0]: https://github.com/nelsonmestevao/pdf_extractor/compare/v0.2.1...v0.3.0
|
||||
[0.2.1]: https://github.com/nelsonmestevao/pdf_extractor/compare/v0.2.0...v0.2.1
|
||||
[0.2.0]: https://github.com/nelsonmestevao/pdf_extractor/compare/v0.1.0...v0.2.0
|
||||
[0.1.0]: https://github.com/nelsonmestevao/pdf_extractor/releases/tag/v0.1.0
|
||||
21
file_processor/dep_pdf_extractor_mod/LICENSE
Normal file
21
file_processor/dep_pdf_extractor_mod/LICENSE
Normal file
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2025 Nelson Estevão
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
91
file_processor/dep_pdf_extractor_mod/README.md
Normal file
91
file_processor/dep_pdf_extractor_mod/README.md
Normal file
@@ -0,0 +1,91 @@
|
||||
# PdfExtractor
|
||||
|
||||
[](https://hex.pm/packages/pdf_extractor)
|
||||
[](https://hexdocs.pm/pdf_extractor)
|
||||
[](https://hex.pm/packages/pdf_extractor)
|
||||
[](https://hex.pm/packages/pdf_extractor)
|
||||
[](https://github.com/nelsonmestevao/pdf_extractor)
|
||||
|
||||
|
||||
A powerful and easy-to-use Elixir library for extracting text and metadata from PDF files.
|
||||
|
||||
PdfExtractor leverages Python's `pdfplumber` library through seamless integration to provide
|
||||
robust PDF text extraction capabilities. It supports both file-based and binary-based operations,
|
||||
making it suitable for various use cases from local file processing to web-based PDF handling.
|
||||
|
||||
## Features
|
||||
|
||||
- 🔍 Extract text from single or multiple PDF pages
|
||||
- 📍 Area-based extraction using bounding boxes
|
||||
- 🌐 Work with PDF data directly from memory (e.g., HTTP downloads)
|
||||
- 📊 Get PDF metadata like title, author, creation date
|
||||
- 🐍 Leverages Python's powerful `pdfplumber` library
|
||||
- 🚀 Simple and intuitive API
|
||||
- ✅ Comprehensive test coverage
|
||||
- 📚 Full documentation
|
||||
|
||||
## Installation
|
||||
|
||||
Add `pdf_extractor` to your list of dependencies in `mix.exs`:
|
||||
|
||||
```elixir
|
||||
def deps do
|
||||
[
|
||||
{:pdf_extractor, "~> 0.5.0"}
|
||||
]
|
||||
end
|
||||
```
|
||||
|
||||
Then start it in your application start function:
|
||||
|
||||
```elixir
|
||||
defmodule MyApp.Application do
|
||||
use Application
|
||||
|
||||
def start(_type, _args) do
|
||||
children = [
|
||||
PdfExtractor,
|
||||
...
|
||||
]
|
||||
|
||||
opts = [strategy: :one_for_one, name: MyApp.Supervisor]
|
||||
Supervisor.start_link(children, opts)
|
||||
end
|
||||
end
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
Extract text from specific regions using bounding boxes `{x0, y0, x1, y1}`:
|
||||
|
||||
```elixir
|
||||
areas = %{
|
||||
0 => {0, 0, 300, 200}, # Top-left area of page 0
|
||||
1 => [
|
||||
{200, 300, 600, 500}, # Bottom-right area of page 1
|
||||
{0, 0, 200, 250}, # Top-left area of page 1
|
||||
]
|
||||
}
|
||||
PdfExtractor.extract_text("path/to/document.pdf", areas)
|
||||
```
|
||||
|
||||
### Return Format
|
||||
|
||||
The function returns a map where keys are page numbers and values are the extracted text:
|
||||
|
||||
```elixir
|
||||
%{
|
||||
0 => "Text from page 0...",
|
||||
1 => ["Text from page 1 (first area)...", "Text from page 1 (second area)..."],
|
||||
2 => "Text from page 2..."
|
||||
}
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
||||
|
||||
## Acknowledgments
|
||||
|
||||
- Built on top of the excellent [pdfplumber](https://github.com/jsvine/pdfplumber) Python library
|
||||
- Uses [pythonx](https://github.com/livebook-dev/pythonx) for seamless Python integration
|
||||
22
file_processor/dep_pdf_extractor_mod/hex_metadata.config
Normal file
22
file_processor/dep_pdf_extractor_mod/hex_metadata.config
Normal file
@@ -0,0 +1,22 @@
|
||||
{<<"links">>,
|
||||
[{<<"Changelog">>,
|
||||
<<"https://github.com/nelsonmestevao/pdf_extractor/blob/main/CHANGELOG.md">>},
|
||||
{<<"GitHub">>,<<"https://github.com/nelsonmestevao/pdf_extractor">>}]}.
|
||||
{<<"name">>,<<"pdf_extractor">>}.
|
||||
{<<"version">>,<<"0.5.0">>}.
|
||||
{<<"description">>,
|
||||
<<"A lightweight Elixir library for extracting text from PDF files using Python's pdfplumber.\nSupports single and multi-page extraction with optional area filtering.">>}.
|
||||
{<<"elixir">>,<<"~> 1.15">>}.
|
||||
{<<"app">>,<<"pdf_extractor">>}.
|
||||
{<<"files">>,
|
||||
[<<"lib">>,<<"lib/pdf_extractor">>,<<"lib/pdf_extractor/pdf_plumber.ex">>,
|
||||
<<"lib/pdf_extractor.ex">>,<<"mix.exs">>,<<"README.md">>,<<"LICENSE">>,
|
||||
<<"CHANGELOG.md">>]}.
|
||||
{<<"licenses">>,[<<"MIT">>]}.
|
||||
{<<"requirements">>,
|
||||
[[{<<"name">>,<<"pythonx">>},
|
||||
{<<"app">>,<<"pythonx">>},
|
||||
{<<"optional">>,false},
|
||||
{<<"requirement">>,<<"~> 0.4.4">>},
|
||||
{<<"repository">>,<<"hexpm">>}]]}.
|
||||
{<<"build_tools">>,[<<"mix">>]}.
|
||||
228
file_processor/dep_pdf_extractor_mod/lib/pdf_extractor.ex
Normal file
228
file_processor/dep_pdf_extractor_mod/lib/pdf_extractor.ex
Normal file
@@ -0,0 +1,228 @@
|
||||
defmodule PdfExtractor do
|
||||
@moduledoc "README.md"
|
||||
|> File.read!()
|
||||
|> String.split("\n\n")
|
||||
|> tl()
|
||||
|> tl()
|
||||
|> Enum.join("\n\n")
|
||||
use GenServer
|
||||
|
||||
@external_resource "README.md"
|
||||
|
||||
# Client
|
||||
|
||||
# Starts the extractor process. `:name` is the only accepted option and
# defaults to this module; any other key makes `Keyword.validate!/2` raise.
def start_link(opts \\ []) do
  server_name =
    opts
    |> Keyword.validate!(name: __MODULE__)
    |> Keyword.fetch!(:name)

  GenServer.start_link(__MODULE__, [], name: server_name)
end
|
||||
|
||||
@doc ~S"""
|
||||
Extracts text from PDF pages.
|
||||
|
||||
It supports extracting from single pages, multiple pages, and specific areas within pages.
|
||||
|
||||
## Page Numbers
|
||||
|
||||
- **Integer**: Extract from single page (e.g., `0` for first page)
|
||||
- **List**: Extract from multiple pages (e.g., `[0, 1, 2]`)
|
||||
- **Empty list** `[]`: Extract from all pages (default)
|
||||
|
||||
## Areas Format
|
||||
|
||||
Areas are specified as a map where keys are page numbers and values are bounding boxes:
|
||||
|
||||
- **Single area**: `%{0 => {x0, y0, x1, y1}}`
|
||||
- **Multiple areas**: `%{0 => [{x0, y0, x1, y1}, {x2, y2, x3, y3}]}`
|
||||
- **Mixed**: `%{0 => {x0, y0, x1, y1}, 1 => [{x2, y2, x3, y3}, {x4, y4, x5, y5}]}`
|
||||
|
||||
## Examples
|
||||
|
||||
Extract text from all pages.
|
||||
|
||||
iex> PdfExtractor.extract_text("priv/fixtures/fatura.pdf")
|
||||
{:ok,
|
||||
%{
|
||||
0 =>
|
||||
"Text Example Bill FATURA\n# 2025010002\nData: Jun 21, 2025\nProjeto de lei para:\nSaldo devedor: 1 525,59 €\nElixir Company\nItem Quantidade Avaliar Quantia\nTrabalho 1 1 500,00 € 1 500,00 €\nMais trabalho 1 25,59 € 25,59 €\nSubtotal: 1 525,59 €\nImposto (0%): 0,00 €\nTotal: 1 525,59 €",
|
||||
1 =>
|
||||
"✂\nReceipt Payment part Account / Payable to\nCH4431999123000889012\n✂\nMax Muster & Söhne\nAccount / Payable to\nCH4431999123000889012 Musterstrasse 123\nMax Muster & Söhne 8000 Seldwyla\nMusterstrasse 123\n8000 Seldwyla\nReference\n210000000003139471430009017\nReference\n210000000003139471430009017\nAdditional information\nBestellung vom 15.10.2020\nPayable by (name/address)\nSimon Muster\nPayable by (name/address)\nMusterstrasse 1\nCurrency Amount\nSimon Muster\n8000 Seldwyla\nCHF 1 949.75 Musterstrasse 1\n8000 Seldwyla\nCurrency Amount\nCHF 1 949.75\nAcceptance point"
|
||||
}}
|
||||
|
||||
Extract text from only some pages.
|
||||
|
||||
iex> PdfExtractor.extract_text("priv/fixtures/fatura.pdf", [0])
|
||||
{:ok,
|
||||
%{
|
||||
0 =>
|
||||
"Text Example Bill FATURA\n# 2025010002\nData: Jun 21, 2025\nProjeto de lei para:\nSaldo devedor: 1 525,59 €\nElixir Company\nItem Quantidade Avaliar Quantia\nTrabalho 1 1 500,00 € 1 500,00 €\nMais trabalho 1 25,59 € 25,59 €\nSubtotal: 1 525,59 €\nImposto (0%): 0,00 €\nTotal: 1 525,59 €"
|
||||
}}
|
||||
|
||||
Extract only the titles in the book chapters.
|
||||
|
||||
iex> PdfExtractor.extract_text("priv/fixtures/book.pdf", %{
|
||||
...> 2 => {0, 0, 612, 190},
|
||||
...> 8 => {0, 0, 612, 190},
|
||||
...> 10 => {0, 0, 612, 190}
|
||||
...> })
|
||||
{:ok,
|
||||
%{
|
||||
2 => "Introdução – Nota do tradutor",
|
||||
8 => "I. Sobre aproveitar o tempo",
|
||||
10 => "II. Sobre a falta de foco na Leitura"
|
||||
}}
|
||||
|
||||
Extract multiple areas from a single page.
|
||||
|
||||
iex> PdfExtractor.extract_text("priv/fixtures/book.pdf", %{
|
||||
...> 1 => [{0, 100, 612, 140}, {0, 400, 612, 440}]
|
||||
...> })
|
||||
{:ok,
|
||||
%{
|
||||
1 => [
|
||||
"CARTAS DE UM ESTOICO, Volume I",
|
||||
"Montecristo Editora Ltda.\ne-mail: editora@montecristoeditora.com.br"
|
||||
]
|
||||
}}
|
||||
"""
|
||||
def extract_text(file_path, pages \\ []) do
  # The server dispatches on {function_name, argument_list}.
  request = {:extract_text, [file_path, pages]}
  GenServer.call(__MODULE__, request)
end
|
||||
|
||||
@doc """
Same request as `extract_text/2`, but with an explicit `GenServer.call/3` timeout.

`timeout` is passed straight to `GenServer.call/3`. `pages` defaults to `[]`,
so the two-argument form is `extract_text_timeout(file_path, timeout)`.
"""
def extract_text_timeout(file_path, pages \\ [], timeout) do
  request = {:extract_text, [file_path, pages]}
  GenServer.call(__MODULE__, request, timeout)
end
|
||||
|
||||
@doc ~S"""
|
||||
Extracts text from PDF binary data. See `extract_text/2` for details on how to specify pages and areas.
|
||||
|
||||
This function allows you to extract text from PDF data that's already in memory,
|
||||
such as data downloaded from a URL or received via an API. This avoids the need
|
||||
to write the PDF to the filesystem.
|
||||
|
||||
## Examples
|
||||
|
||||
Extract text from all pages.
|
||||
|
||||
iex> content = File.read!("priv/fixtures/fatura.pdf")
|
||||
...> PdfExtractor.extract_text_from_binary(content)
|
||||
{:ok,
|
||||
%{
|
||||
0 =>
|
||||
"Text Example Bill FATURA\n# 2025010002\nData: Jun 21, 2025\nProjeto de lei para:\nSaldo devedor: 1 525,59 €\nElixir Company\nItem Quantidade Avaliar Quantia\nTrabalho 1 1 500,00 € 1 500,00 €\nMais trabalho 1 25,59 € 25,59 €\nSubtotal: 1 525,59 €\nImposto (0%): 0,00 €\nTotal: 1 525,59 €",
|
||||
1 =>
|
||||
"✂\nReceipt Payment part Account / Payable to\nCH4431999123000889012\n✂\nMax Muster & Söhne\nAccount / Payable to\nCH4431999123000889012 Musterstrasse 123\nMax Muster & Söhne 8000 Seldwyla\nMusterstrasse 123\n8000 Seldwyla\nReference\n210000000003139471430009017\nReference\n210000000003139471430009017\nAdditional information\nBestellung vom 15.10.2020\nPayable by (name/address)\nSimon Muster\nPayable by (name/address)\nMusterstrasse 1\nCurrency Amount\nSimon Muster\n8000 Seldwyla\nCHF 1 949.75 Musterstrasse 1\n8000 Seldwyla\nCurrency Amount\nCHF 1 949.75\nAcceptance point"
|
||||
}}
|
||||
|
||||
Extract text from only some pages.
|
||||
|
||||
iex> content = File.read!("priv/fixtures/fatura.pdf")
|
||||
...> PdfExtractor.extract_text_from_binary(content, [0])
|
||||
{:ok,
|
||||
%{
|
||||
0 =>
|
||||
"Text Example Bill FATURA\n# 2025010002\nData: Jun 21, 2025\nProjeto de lei para:\nSaldo devedor: 1 525,59 €\nElixir Company\nItem Quantidade Avaliar Quantia\nTrabalho 1 1 500,00 € 1 500,00 €\nMais trabalho 1 25,59 € 25,59 €\nSubtotal: 1 525,59 €\nImposto (0%): 0,00 €\nTotal: 1 525,59 €"
|
||||
}}
|
||||
|
||||
Extract only the titles in the book chapters.
|
||||
|
||||
iex> content = File.read!("priv/fixtures/book.pdf")
|
||||
...>
|
||||
...> PdfExtractor.extract_text_from_binary(content, %{
|
||||
...> 2 => {0, 0, 612, 190},
|
||||
...> 8 => {0, 0, 612, 190},
|
||||
...> 10 => {0, 0, 612, 190}
|
||||
...> })
|
||||
{:ok,
|
||||
%{
|
||||
2 => "Introdução – Nota do tradutor",
|
||||
8 => "I. Sobre aproveitar o tempo",
|
||||
10 => "II. Sobre a falta de foco na Leitura"
|
||||
}}
|
||||
|
||||
Extract multiple areas from a single page.
|
||||
|
||||
iex> content = File.read!("priv/fixtures/book.pdf")
|
||||
...>
|
||||
...> PdfExtractor.extract_text_from_binary(content, %{
|
||||
...> 1 => [{0, 100, 612, 140}, {0, 400, 612, 440}]
|
||||
...> })
|
||||
{:ok,
|
||||
%{
|
||||
1 => [
|
||||
"CARTAS DE UM ESTOICO, Volume I",
|
||||
"Montecristo Editora Ltda.\ne-mail: editora@montecristoeditora.com.br"
|
||||
]
|
||||
}}
|
||||
|
||||
"""
|
||||
def extract_text_from_binary(binary, pages \\ []) do
  # The server dispatches on {function_name, argument_list}.
  request = {:extract_text_from_binary, [binary, pages]}
  GenServer.call(__MODULE__, request)
end
|
||||
|
||||
@doc """
|
||||
Extracts metadata from a PDF file info trailers. Typically includes "CreationDate", "ModDate", "Producer", et cetera.
|
||||
|
||||
## Examples
|
||||
|
||||
iex> PdfExtractor.extract_metadata("priv/fixtures/book.pdf")
|
||||
{:ok,
|
||||
%{
|
||||
"CreationDate" => "D:20250718212328Z",
|
||||
"Creator" => "Stirling-PDF v0.44.2",
|
||||
"ModDate" => "D:20250718212328Z",
|
||||
"Producer" => "Stirling-PDF v0.44.2"
|
||||
}}
|
||||
|
||||
"""
|
||||
def extract_metadata(file_path) do
  # The server dispatches on {function_name, argument_list}.
  request = {:extract_metadata, [file_path]}
  GenServer.call(__MODULE__, request)
end
|
||||
|
||||
@doc """
|
||||
Extracts metadata from PDF binary data. Similar to `extract_metadata/1` but works with PDF data in memory instead of
|
||||
files.
|
||||
|
||||
## Examples
|
||||
|
||||
iex> content = File.read!("priv/fixtures/book.pdf")
|
||||
...> PdfExtractor.extract_metadata_from_binary(content)
|
||||
{:ok,
|
||||
%{
|
||||
"CreationDate" => "D:20250718212328Z",
|
||||
"Creator" => "Stirling-PDF v0.44.2",
|
||||
"ModDate" => "D:20250718212328Z",
|
||||
"Producer" => "Stirling-PDF v0.44.2"
|
||||
}}
|
||||
|
||||
"""
|
||||
def extract_metadata_from_binary(binary) do
  # The server dispatches on {function_name, argument_list}.
  request = {:extract_metadata_from_binary, [binary]}
  GenServer.call(__MODULE__, request)
end
|
||||
|
||||
# Server
|
||||
|
||||
@doc false
|
||||
@impl true
|
||||
def init([] = state) do
  # A RuntimeError saying the interpreter has already been initialized is
  # tolerated; any other RuntimeError is re-raised with its original stacktrace.
  try do
    :ok = PdfExtractor.PdfPlumber.start()
  rescue
    error in RuntimeError ->
      if error.message =~ ~r/Python interpreter has already been initialized/,
        do: :ok,
        else: reraise(error, __STACKTRACE__)
  end

  {:ok, state}
end
|
||||
|
||||
@doc false
|
||||
@impl true
|
||||
def handle_call({function, args}, _from, state) when is_atom(function) and is_list(args) do
  # Only Pythonx.Error becomes an {:error, exception} reply; any other
  # exception still propagates out of the callback.
  reply =
    try do
      {:ok, apply(PdfExtractor.PdfPlumber, function, args)}
    rescue
      exception in Pythonx.Error -> {:error, exception}
    end

  {:reply, reply, state}
end
|
||||
end
|
||||
@@ -0,0 +1,189 @@
|
||||
defmodule PdfExtractor.PdfPlumber do
  @moduledoc false

  @type area :: {non_neg_integer(), non_neg_integer(), non_neg_integer(), non_neg_integer()}
  @type page :: non_neg_integer()

  @doc """
  Initialises the Python environment through `Pythonx.uv_init/1`, pinning
  Python 3.12 and pdfplumber 0.11.7.
  """
  def start do
    Pythonx.uv_init("""
    [project]
    name = "pdf_extractor"
    version = "#{to_string(version())}"
    requires-python = "==3.12.*"
    dependencies = [
      "pdfplumber==0.11.7"
    ]
    """)
  end

  @doc """
  Extracts text from the PDF at `file_path`.

  `pages` may be a single page number, a list of page numbers (`[]` means all
  pages), or a map of page number to bounding box(es). The result is a map
  keyed by page number. Page numbers outside the document are omitted from
  the result.
  """
  @spec extract_text(
          file_path :: String.t(),
          pages :: page() | list(page()) | %{page() => area() | [area()] | nil}
        ) :: %{page() => String.t() | list(String.t())}
  def extract_text(file_path, page_number) when is_integer(page_number) do
    extract_text(file_path, List.wrap(page_number))
  end

  def extract_text(file_path, pages) when is_list(pages) do
    extract_from_path(file_path, pages, %{})
  end

  def extract_text(file_path, pages) when is_map(pages) do
    extract_from_path(file_path, Map.keys(pages), pages)
  end

  @doc """
  This version avoids the need to put the pdf on a filesystem.
  This allows this to work

      url = "https://erlang.org/download/armstrong_thesis_2003.pdf"
      url |> :httpc.request() |> elem(1) |> elem(2) |> :binary.list_to_bin() |> PdfExtractor.extract_text_from_binary()

  `pages` accepts the same forms as in `extract_text/2`, and out-of-range page
  numbers are likewise omitted from the result.
  """
  def extract_text_from_binary(binary, page_number) when is_integer(page_number) do
    extract_text_from_binary(binary, List.wrap(page_number))
  end

  def extract_text_from_binary(binary, pages) when is_list(pages) do
    extract_from_binary(binary, pages, %{})
  end

  def extract_text_from_binary(binary, pages) when is_map(pages) do
    extract_from_binary(binary, Map.keys(pages), pages)
  end

  @doc """
  Returns the metadata dictionary pdfplumber reports for the PDF at `file_path`.
  """
  def extract_metadata(file_path) do
    eval_python(
      python_extract_metadata_code(),
      "main(file_path.decode('utf-8'))",
      %{"file_path" => file_path}
    )
  end

  @doc """
  Returns the metadata dictionary pdfplumber reports for in-memory PDF data.
  """
  def extract_metadata_from_binary(binary) do
    eval_python(
      python_extract_metadata_code(),
      "main(BytesIO(binary))",
      %{"binary" => binary}
    )
  end

  # Runs the text extraction against a file on disk.
  defp extract_from_path(file_path, page_numbers, areas) do
    eval_python(
      python_extract_code(),
      "main(file_path.decode('utf-8'), page_numbers, areas)",
      %{"file_path" => file_path, "page_numbers" => page_numbers, "areas" => areas}
    )
  end

  # Runs the text extraction against PDF bytes held in memory.
  defp extract_from_binary(binary, page_numbers, areas) do
    eval_python(
      python_extract_code(),
      "main(BytesIO(binary), page_numbers, areas)",
      %{"binary" => binary, "page_numbers" => page_numbers, "areas" => areas}
    )
  end

  # Evaluates `definitions` followed by `entry_call` with the given globals and
  # decodes the value of the final expression. BytesIO is imported
  # unconditionally so the path and binary variants share one script shape.
  defp eval_python(definitions, entry_call, globals) do
    """
    from io import BytesIO

    #{definitions}

    #{entry_call}
    """
    |> Pythonx.eval(globals)
    |> elem(0)
    |> Pythonx.decode()
  end

  # Python source for text extraction. `main` returns a dict keyed by page
  # number rather than a positional list: out-of-range pages are skipped, and a
  # positional list would then no longer line up with the requested page
  # numbers on the Elixir side, attributing text to the wrong page.
  defp python_extract_code do
    """
    import pdfplumber
    import logging

    logging.getLogger("pdfminer").setLevel(logging.ERROR)

    def extract_from_page(page, areas=None):
        if areas is None:
            return page.extract_text()
        elif isinstance(areas, list):
            return [page.within_bbox(area).extract_text() for area in areas]
        else:
            return page.within_bbox(areas).extract_text()

    def main(content, page_numbers, areas):
        results = dict()
        with pdfplumber.open(content) as pdf:
            total_pages = len(pdf.pages)
            if page_numbers == []:
                page_numbers = list(range(total_pages))
            for page_number in page_numbers:
                if page_number >= 0 and page_number < total_pages:
                    results[page_number] = extract_from_page(pdf.pages[page_number], areas.get(page_number))
        return results
    """
  end

  # Python source that opens the document and returns `pdf.metadata`.
  defp python_extract_metadata_code do
    """
    import pdfplumber
    import logging

    logging.getLogger("pdfminer").setLevel(logging.ERROR)

    def main(content):
        with pdfplumber.open(content) as pdf:
            return pdf.metadata
    """
  end

  # Version of the :pdf_extractor application as reported by Application.spec/2.
  # NOTE(review): this is nil when the :pdf_extractor app spec is not loaded,
  # which would put an empty version in the generated project file; confirm
  # this holds for the vendored copy.
  defp version do
    Application.spec(:pdf_extractor, :vsn)
  end
end
|
||||
90
file_processor/dep_pdf_extractor_mod/mix.exs
Normal file
90
file_processor/dep_pdf_extractor_mod/mix.exs
Normal file
@@ -0,0 +1,90 @@
|
||||
defmodule PdfExtractor.MixProject do
  use Mix.Project

  @app :pdf_extractor
  @name "PdfExtractor"
  @version "0.5.0"
  @source_url "https://github.com/nelsonmestevao/pdf_extractor"

  # Mix project definition; each non-trivial section lives in a private helper below.
  def project do
    [
      name: @name,
      app: @app,
      version: @version,
      elixir: "~> 1.15",
      start_permanent: Mix.env() == :prod,
      deps: deps(),
      description: description(),
      package: package(),
      docs: docs(),
      aliases: aliases(),
      dialyzer: dialyzer(),
      source_url: @source_url
    ]
  end

  # OTP application settings.
  def application, do: [extra_applications: [:logger]]

  # Runtime dependency first, development tooling after it.
  defp deps do
    [
      {:pythonx, "~> 0.4.4"},

      # tools
      {:credo, "~> 1.7", only: [:dev, :test], runtime: false},
      {:dialyxir, "~> 1.4", only: [:dev, :test], runtime: false},
      {:doctest_formatter, "~> 0.4.0", only: [:dev, :test], runtime: false},
      {:ex_doc, "~> 0.38", only: :dev, runtime: false},
      {:styler, "~> 1.0", only: [:dev, :test], runtime: false}
    ]
  end

  defp aliases, do: ["lint.dialyzer": ["dialyzer --format dialyxir"]]

  # Package description published to Hex.
  defp description do
    """
    A lightweight Elixir library for extracting text from PDF files using Python's pdfplumber.
    Supports single and multi-page extraction with optional area filtering.
    """
  end

  # Hex package metadata.
  defp package do
    links = %{
      "GitHub" => @source_url,
      "Changelog" => "#{@source_url}/blob/main/CHANGELOG.md"
    }

    [
      name: @app,
      files: ~w(lib mix.exs README.md LICENSE CHANGELOG*),
      licenses: ["MIT"],
      links: links,
      maintainers: ["Nelson Estevão <nelsonmestevao@proton.me>"]
    ]
  end

  # ExDoc configuration.
  defp docs do
    [
      main: "readme",
      name: @name,
      source_ref: "v#{@version}",
      source_url: @source_url,
      extras: ["README.md", "CHANGELOG.md", "LICENSE"]
    ]
  end

  # Dialyzer configuration; the PLT file name embeds the Elixir and OTP versions.
  defp dialyzer do
    plt_path = "priv/plts/elixir-#{System.version()}-erlang-otp-#{System.otp_release()}.plt"

    [
      flags: [:no_opaque],
      list_unused_filters: true,
      plt_add_deps: :apps_tree,
      plt_add_apps: [:ex_unit, :iex, :mix, :credo_naming],
      plt_file: {:no_warn, plt_path}
    ]
  end
end
|
||||
88
file_processor/lib/file_processor.ex
Normal file
88
file_processor/lib/file_processor.ex
Normal file
@@ -0,0 +1,88 @@
|
||||
defmodule FileProcessor do
  @moduledoc """
  GenServer front-end for `FileProcessor.Impl`.

  Every client function is a synchronous `GenServer.call/2` to the process
  registered as `FileProcessor.Server`. Each callback delegates to
  `FileProcessor.Impl` and replies with whatever that returns. The server
  state is the value given to `start_link/1` and no callback changes it.
  """

  use GenServer

  alias FileProcessor.Impl

  @server FileProcessor.Server

  # Client

  @doc """
  Starts the server, registered as `FileProcessor.Server`, with
  `initial_values` as its state.
  """
  def start_link(initial_values) do
    GenServer.start_link(__MODULE__, initial_values, name: @server)
  end

  @doc """
  Stores `contents` under `filename`; replies with `FileProcessor.Impl.put_file/2`'s result.
  """
  def put_file({filename, contents}) do
    # The message must be {:put_file, {filename, contents}}: that is the shape
    # handle_call/3 matches. A flat 3-tuple matches no clause and crashes the server.
    GenServer.call(@server, {:put_file, {filename, contents}})
  end

  @doc "Replies with `FileProcessor.Impl.get_files/0`'s result."
  def get_files() do
    GenServer.call(@server, :get_files)
  end

  @doc "Deletes `filename`; replies with `FileProcessor.Impl.delete_file/1`'s result."
  def delete_file(filename) do
    GenServer.call(@server, {:delete_file, filename})
  end

  @doc """
  Stores `contents` as the RFP under `filename`; replies with
  `FileProcessor.Impl.put_rfp/2`'s result.
  """
  def put_rfp({filename, contents}) do
    # Same message-shape requirement as put_file/1.
    GenServer.call(@server, {:put_rfp, {filename, contents}})
  end

  @doc "Replies with `FileProcessor.Impl.get_rfp/0`'s result."
  def get_rfp() do
    GenServer.call(@server, :get_rfp)
  end

  @doc "Deletes the RFP `filename`; replies with `FileProcessor.Impl.delete_rfp/1`'s result."
  def delete_rfp(filename) do
    GenServer.call(@server, {:delete_rfp, filename})
  end

  @doc "Replies with `FileProcessor.Impl.process_files/0`'s result."
  def process_files() do
    GenServer.call(@server, :process_files)
  end

  # Server

  @impl true
  def init(initial_values) do
    Impl.do_init()
    {:ok, initial_values}
  end

  @impl true
  def handle_call({:put_file, {filename, file}}, _from, current_state) do
    {:reply, Impl.put_file(filename, file), current_state}
  end

  def handle_call(:get_files, _from, current_state) do
    {:reply, Impl.get_files(), current_state}
  end

  def handle_call({:delete_file, filename}, _from, current_state) do
    {:reply, Impl.delete_file(filename), current_state}
  end

  def handle_call({:put_rfp, {filename, file}}, _from, current_state) do
    {:reply, Impl.put_rfp(filename, file), current_state}
  end

  def handle_call(:get_rfp, _from, current_state) do
    {:reply, Impl.get_rfp(), current_state}
  end

  def handle_call({:delete_rfp, filename}, _from, current_state) do
    {:reply, Impl.delete_rfp(filename), current_state}
  end

  def handle_call(:process_files, _from, current_state) do
    {:reply, Impl.process_files(), current_state}
  end
end
|
||||
23
file_processor/lib/file_processor/application.ex
Normal file
23
file_processor/lib/file_processor/application.ex
Normal file
@@ -0,0 +1,23 @@
|
||||
defmodule FileProcessor.Application do
  # See https://hexdocs.pm/elixir/Application.html
  # for more information on OTP Applications
  @moduledoc false

  use Application

  @impl true
  def start(_type, _args) do
    # See https://hexdocs.pm/elixir/Supervisor.html
    # for other strategies and supported options
    Supervisor.start_link(children(), strategy: :one_for_one, name: FileProcessor.Supervisor)
  end

  # Child specs in start order.
  defp children do
    [
      {FileProcessor, %{}},
      PdfExtractor,
      {ResultsServer, %{results: %{}, good: [], bad: []}},
      {Phoenix.PubSub, name: TrustedEdgeServer.PubSub}
    ]
  end
end
|
||||
230
file_processor/lib/file_processor/impl.ex
Normal file
230
file_processor/lib/file_processor/impl.ex
Normal file
@@ -0,0 +1,230 @@
|
||||
defmodule FileProcessor.Impl do
|
||||
require Logger
|
||||
|
||||
@output_dir Application.fetch_env!(:file_processor, :filepath)
|
||||
@rfp_dir Application.fetch_env!(:file_processor, :rfppath)
|
||||
|
||||
@gpu_node :gpu@kittykat
|
||||
|
||||
# Creates the upload directory and the RFP directory (including parents) and
# logs the outcome of each attempt at debug level.
def do_init() do
  Logger.debug("Using dir: #{@output_dir} ")
  res = File.mkdir_p(@output_dir)
  # inspect/1 is required: File.mkdir_p/1 may return {:error, reason}, and a
  # tuple cannot be interpolated into a string directly.
  Logger.debug("Result = #{inspect(res)}")

  Logger.debug("Using rfp dir: #{@rfp_dir} ")
  res = File.mkdir_p(@rfp_dir)
  Logger.debug("Result = #{inspect(res)}")
end
|
||||
|
||||
def put_file(filename, contents) do
|
||||
Logger.debug("Using dir: #{@output_dir} ")
|
||||
Logger.debug("Handling #{filename}...")
|
||||
|
||||
full_path = Path.join(@output_dir, filename)
|
||||
|
||||
case File.write(full_path, contents) do
|
||||
:ok ->
|
||||
# Return all files so liveview can update
|
||||
get_files()
|
||||
|
||||
{:error, reason} ->
|
||||
{:ok, files} = get_files()
|
||||
{:error, reason, files}
|
||||
end
|
||||
end
|
||||
|
||||
def get_files() do
|
||||
File.ls(@output_dir)
|
||||
end
|
||||
|
||||
def delete_file(filename) do
|
||||
Logger.debug("Deleting #{filename}...")
|
||||
full_path = Path.join(@output_dir, filename)
|
||||
|
||||
case File.rm(full_path) do
|
||||
:ok ->
|
||||
get_files()
|
||||
|
||||
{:error, reason} ->
|
||||
{:ok, files} = get_files()
|
||||
{:error, reason, files}
|
||||
end
|
||||
end
|
||||
|
||||
def put_rfp(filename, contents) do
|
||||
Logger.debug("Using dir: #{@rfp_dir} ")
|
||||
Logger.debug("Handling #{filename}...")
|
||||
|
||||
# Remove old one(s)
|
||||
files_to_delete = Path.wildcard(Path.join(@rfp_dir, "*"))
|
||||
Enum.each(files_to_delete, fn file -> File.rm(file) end)
|
||||
|
||||
# Write new one
|
||||
full_path = Path.join(@rfp_dir, filename)
|
||||
|
||||
case File.write(full_path, contents) do
|
||||
:ok ->
|
||||
get_rfp()
|
||||
|
||||
{:error, reason} ->
|
||||
{:ok, rfps} = get_rfp()
|
||||
{:error, reason, rfps}
|
||||
end
|
||||
end
|
||||
|
||||
def get_rfp() do
|
||||
File.ls(@rfp_dir)
|
||||
end
|
||||
|
||||
def delete_rfp(filename) do
|
||||
Logger.debug("Deleting RFP #{filename}...")
|
||||
|
||||
full_path = Path.join(@rfp_dir, filename)
|
||||
|
||||
case File.rm(full_path) do
|
||||
:ok ->
|
||||
get_rfp()
|
||||
|
||||
{:error, reason} ->
|
||||
{:ok, rfps} = get_rfp()
|
||||
{:error, reason, rfps}
|
||||
end
|
||||
end
|
||||
|
||||
def process_files() do
|
||||
# Only do work if RFP uploaded
|
||||
{:ok, rfp} = get_rfp()
|
||||
|
||||
if Enum.empty?(rfp) do
|
||||
{:error, :rfp_missing}
|
||||
else
|
||||
# Start work for all files
|
||||
# Reset results since new job started
|
||||
ResultsServer.reset()
|
||||
|
||||
res = Node.connect(@gpu_node)
|
||||
Logger.debug("GPU Node connection: #{inspect(res)}")
|
||||
|
||||
case get_files() do
|
||||
{:ok, files} ->
|
||||
Enum.each(files, fn filepath ->
|
||||
full_path = Path.join(@output_dir, filepath)
|
||||
spawn(fn -> FileProcessor.Impl.handle_file(full_path) end)
|
||||
end)
|
||||
|
||||
{:error, reason} ->
|
||||
Logger.error("Failed to read files dir: #{reason}")
|
||||
end
|
||||
|
||||
# Start work for RFP
|
||||
case get_rfp() do
|
||||
{:ok, files} ->
|
||||
Enum.each(files, fn filepath ->
|
||||
full_path = Path.join(@rfp_dir, filepath)
|
||||
spawn(fn -> FileProcessor.Impl.handle_rfp(full_path) end)
|
||||
end)
|
||||
|
||||
{:error, reason} ->
|
||||
Logger.error("Failed to read rfp file dir: #{reason}")
|
||||
end
|
||||
|
||||
{:ok}
|
||||
end
|
||||
end
|
||||
|
||||
def handle_file(filepath) do
|
||||
Logger.debug("Processing #{filepath}")
|
||||
|
||||
filetype = get_filetype(filepath)
|
||||
|
||||
text = extract_text(filetype, filepath)
|
||||
#Logger.debug("Got text from #{filepath}: #{inspect(text)}")
|
||||
|
||||
chunks =
|
||||
case text do
|
||||
{:error} ->
|
||||
# Send failure to results server
|
||||
[]
|
||||
|
||||
good_text ->
|
||||
chunks = TextChunker.split(good_text, chunk_size: 100, chunk_overlap: 10)
|
||||
#Logger.debug("Chunks: #{inspect(chunks)}")
|
||||
chunks
|
||||
end
|
||||
|
||||
all_chunks =
|
||||
Enum.map(chunks, fn chunk ->
|
||||
# Logger.debug("Chunk: #{chunk.text}")
|
||||
embedding = GenServer.call({BertEmbedding, @gpu_node}, {:embed, chunk.text}, :infinity)
|
||||
# Logger.debug("Embedding: #{inspect(embedding)}")
|
||||
%ChunkResult{chunk: chunk, embedding: embedding}
|
||||
end)
|
||||
|
||||
results = %FileResult{filename: Path.basename(filepath), chunks: all_chunks}
|
||||
#Logger.debug("Result done struct = #{inspect(results)}")
|
||||
ResultsServer.put_result(results)
|
||||
end
|
||||
|
||||
def handle_rfp(filepath) do
|
||||
Logger.debug("Processing RFP #{filepath}")
|
||||
end
|
||||
|
||||
defp get_filetype(filepath) do
|
||||
res = FileType.from_path(filepath)
|
||||
Logger.debug("Filetype = #{inspect(res)} for #{filepath}")
|
||||
|
||||
case res do
|
||||
{:ok, {type, _}} ->
|
||||
type
|
||||
|
||||
{:error, _} ->
|
||||
if String.ends_with?(filepath, "txt") do
|
||||
"txt"
|
||||
else
|
||||
{:error}
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
#############################################
|
||||
# Extract text functions for each filetype
|
||||
#############################################
|
||||
|
||||
defp extract_text("txt", filepath) do
|
||||
Logger.debug("Extracting from text file: #{filepath}")
|
||||
|
||||
case File.read(filepath) do
|
||||
{:ok, content} ->
|
||||
content
|
||||
|
||||
{:error, reason} ->
|
||||
Logger.error("Error reading file: #{reason}")
|
||||
{:error}
|
||||
end
|
||||
end
|
||||
|
||||
defp extract_text("pdf", filepath) do
|
||||
Logger.debug("Extracting from pdf file: #{filepath}")
|
||||
|
||||
case PdfExtractor.extract_text_timeout(filepath, :infinity) do
|
||||
{:ok, res} ->
|
||||
values = Map.values(res)
|
||||
Enum.join(values, " ")
|
||||
|
||||
{:error, reason} ->
|
||||
Logger.error("Error extracting pdf: #{filepath}, #{inspect(reason)}")
|
||||
{:error}
|
||||
|
||||
_ ->
|
||||
Logger.error("Error surprsing value from pdf extraction: #{filepath}")
|
||||
{:error}
|
||||
end
|
||||
end
|
||||
|
||||
defp extract_text(unk, filepath) do
|
||||
Logger.error("Unknown filetype for file: #{filepath}, received: #{inspect(unk)}")
|
||||
{:error}
|
||||
end
|
||||
|
||||
#######################################
|
||||
end
|
||||
7
file_processor/lib/file_processor/structs.ex
Normal file
7
file_processor/lib/file_processor/structs.ex
Normal file
@@ -0,0 +1,7 @@
|
||||
defmodule FileResult do
  @moduledoc """
  Result of processing one file: its `filename` (defaults to `""`) and the
  list of `chunks` produced for it (defaults to `[]`).
  """

  defstruct [filename: "", chunks: []]
end
|
||||
|
||||
defmodule ChunkResult do
  @moduledoc """
  Pairs a text `chunk` (a `TextChunker.Chunk`, empty by default) with its
  `embedding` (an empty map by default).
  """

  defstruct [chunk: %TextChunker.Chunk{}, embedding: %{}]
end
|
||||
89
file_processor/lib/results_server.ex
Normal file
89
file_processor/lib/results_server.ex
Normal file
@@ -0,0 +1,89 @@
|
||||
defmodule ResultsServer do
  @moduledoc """
  GenServer that collects per-file processing results and broadcasts progress.

  State shape:

      %{
        results: %{filename => FileResult},
        good: [filename],
        bad: [filename]
      }

  A result with an empty chunk list is recorded as bad; any other result is
  recorded as good and stored under its filename.
  """

  use GenServer
  require Logger

  @pubsub TrustedEdgeServer.PubSub
  @topic "the_topic"

  # Client
  #

  @doc """
  Starts the server registered as `ResultsServer` with `initial_values` as
  its state.
  """
  def start_link(initial_values) do
    GenServer.start_link(__MODULE__, initial_values, name: ResultsServer)
  end

  @doc """
  Asynchronously records `file_result` (expects `filename` and `chunks` fields).
  """
  def put_result(file_result) do
    GenServer.cast(ResultsServer, {:put_result, file_result})
  end

  @doc """
  Asynchronously logs the good/bad lists and the chunks of every stored result.
  """
  def print_results do
    GenServer.cast(ResultsServer, :print_results)
  end

  @doc """
  Asynchronously clears all collected results.
  """
  def reset do
    GenServer.cast(ResultsServer, :reset)
  end

  # Server

  @impl true
  def init(initial_values) do
    {:ok, initial_values}
  end

  @impl true
  def handle_cast({:put_result, file_result}, current_state) do
    filename = file_result.filename
    Logger.debug("Got result: #{inspect(filename)}")

    {new_state, status} = record_result(current_state, file_result)

    done_files = new_state.bad ++ new_state.good
    all_files = list_all_files()
    remaining = all_files -- done_files
    p = fraction_done(done_files, all_files)

    Phoenix.PubSub.broadcast(@pubsub, @topic, {:new_result, {filename, status}})
    Phoenix.PubSub.broadcast(@pubsub, @topic, {:new_percent_done, p})
    Logger.info("Percent done: #{p}")
    Logger.info("Remaining: #{inspect(remaining)}")

    {:noreply, new_state}
  end

  @impl true
  def handle_cast(:print_results, current_state) do
    Logger.debug("Good: #{inspect(current_state.good)}")
    Logger.debug("Bad: #{inspect(current_state.bad)}")

    Enum.each(current_state.results, fn {key, value} ->
      Logger.debug("File: #{key}")
      Logger.debug("Chunks: #{inspect(value.chunks)}")
    end)

    {:noreply, current_state}
  end

  @impl true
  def handle_cast(:reset, _current_state) do
    {:noreply, %{results: %{}, good: [], bad: []}}
  end

  # Adds the result to the state and tells whether it counted as good or bad.
  defp record_result(state, %{chunks: [], filename: filename}) do
    {%{state | bad: state.bad ++ [filename]}, :bad}
  end

  defp record_result(state, %{filename: filename} = file_result) do
    new_state = %{
      state
      | good: state.good ++ [filename],
        results: Map.put(state.results, filename, file_result)
    }

    {new_state, :good}
  end

  # Lists the files known to FileProcessor; a failed listing is logged and
  # treated as empty so that a single bad listing does not crash the server.
  defp list_all_files do
    case FileProcessor.get_files() do
      {:ok, files} ->
        files

      other ->
        Logger.error("Could not list files: #{inspect(other)}")
        []
    end
  end

  # Fraction of files that have a result. With no files listed there is
  # nothing left to wait for, so report 1.0 instead of dividing by zero.
  defp fraction_done(_done_files, []), do: 1.0
  defp fraction_done(done_files, all_files), do: length(done_files) / length(all_files)
end
|
||||
31
file_processor/mix.exs
Normal file
31
file_processor/mix.exs
Normal file
@@ -0,0 +1,31 @@
|
||||
defmodule FileProcessor.MixProject do
  use Mix.Project

  @app :file_processor
  @version "0.1.0"

  # Project definition read by Mix.
  def project do
    [
      app: @app,
      version: @version,
      elixir: "~> 1.18",
      start_permanent: Mix.env() == :prod,
      deps: deps()
    ]
  end

  # Run "mix help compile.app" to learn about applications.
  def application do
    extra = [:logger, :runtime_tools, :wx, :observer]

    [
      extra_applications: extra,
      mod: {FileProcessor.Application, []}
    ]
  end

  # Run "mix help deps" to learn about dependencies.
  defp deps do
    [
      {:file_type, "~> 0.1"},
      {:text_chunker, "~> 0.5.2"},
      {:pdf_extractor, "~> 0.5.0"},
      {:phoenix_pubsub, "~> 2.0"}
    ]
  end
end
|
||||
11
file_processor/mix.lock
Normal file
11
file_processor/mix.lock
Normal file
@@ -0,0 +1,11 @@
|
||||
%{
|
||||
"cc_precompiler": {:hex, :cc_precompiler, "0.1.11", "8c844d0b9fb98a3edea067f94f616b3f6b29b959b6b3bf25fee94ffe34364768", [:mix], [{:elixir_make, "~> 0.7", [hex: :elixir_make, repo: "hexpm", optional: false]}], "hexpm", "3427232caf0835f94680e5bcf082408a70b48ad68a5f5c0b02a3bea9f3a075b9"},
|
||||
"elixir_make": {:hex, :elixir_make, "0.9.0", "6484b3cd8c0cee58f09f05ecaf1a140a8c97670671a6a0e7ab4dc326c3109726", [:mix], [], "hexpm", "db23d4fd8b757462ad02f8aa73431a426fe6671c80b200d9710caf3d1dd0ffdb"},
|
||||
"file_type": {:hex, :file_type, "0.1.0", "6197174cae5485468fc0cf0244200129e16748aba0f1ceee9b8792e4fef0f03f", [:mix], [], "hexpm", "650b8a933592b004e1c57ad03080e9ef074171764c44e36af14199ae9a408c88"},
|
||||
"fine": {:hex, :fine, "0.1.4", "b19a89c1476c7c57afb5f9314aed5960b5bc95d5277de4cb5ee8e1d1616ce379", [:mix], [], "hexpm", "be3324cc454a42d80951cf6023b9954e9ff27c6daa255483b3e8d608670303f5"},
|
||||
"nimble_options": {:hex, :nimble_options, "1.1.1", "e3a492d54d85fc3fd7c5baf411d9d2852922f66e69476317787a7b2bb000a61b", [:mix], [], "hexpm", "821b2470ca9442c4b6984882fe9bb0389371b8ddec4d45a9504f00a66f650b44"},
|
||||
"pdf_extractor": {:hex, :pdf_extractor, "0.5.0", "eaa9c11e936a34785aff5896ed3aca0d109bc21b02eac71e956a5478726a49de", [:mix], [{:pythonx, "~> 0.4.4", [hex: :pythonx, repo: "hexpm", optional: false]}], "hexpm", "1119734ce83ad974af2225e5b89e91ddad0876f3c29242df1e479276d12b7576"},
|
||||
"phoenix_pubsub": {:hex, :phoenix_pubsub, "2.2.0", "ff3a5616e1bed6804de7773b92cbccfc0b0f473faf1f63d7daf1206c7aeaaa6f", [:mix], [], "hexpm", "adc313a5bf7136039f63cfd9668fde73bba0765e0614cba80c06ac9460ff3e96"},
|
||||
"pythonx": {:hex, :pythonx, "0.4.7", "604a3a78377abdaa8739c561cb871c856b0e80d25fd057277839912017004af0", [:make, :mix], [{:cc_precompiler, "~> 0.1", [hex: :cc_precompiler, repo: "hexpm", optional: false]}, {:elixir_make, "~> 0.9", [hex: :elixir_make, repo: "hexpm", optional: false]}, {:fine, "~> 0.1.2", [hex: :fine, repo: "hexpm", optional: false]}], "hexpm", "20d8b456df995e6ccd6d88dcf118ba80464194515f71a5c89aacdb824d235c52"},
|
||||
"text_chunker": {:hex, :text_chunker, "0.5.2", "39da4765846eabc222fde0f76e68111eee3b17bf826345085cd829dee9950402", [:mix], [{:nimble_options, "~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}], "hexpm", "d5c180a9deb4ad9a9864b6c0d8bbfaf2d26bede59887d3af23cbfae551928a95"},
|
||||
}
|
||||
1
file_processor/start.txt
Normal file
1
file_processor/start.txt
Normal file
@@ -0,0 +1 @@
|
||||
iex --sname fp -S mix
|
||||
8
file_processor/test/file_processor_test.exs
Normal file
8
file_processor/test/file_processor_test.exs
Normal file
@@ -0,0 +1,8 @@
|
||||
defmodule FileProcessorTest do
  # Runs the doctests of FileProcessor plus one check of FileProcessor.hello/0.
  # NOTE(review): this looks like the default test generated by `mix new` -
  # confirm that FileProcessor.hello/0 still exists.
  use ExUnit.Case
  doctest FileProcessor

  test "greets the world" do
    greeting = FileProcessor.hello()
    assert greeting == :world
  end
end
|
||||
1
file_processor/test/test_helper.exs
Normal file
1
file_processor/test/test_helper.exs
Normal file
@@ -0,0 +1 @@
|
||||
ExUnit.start()
|
||||
Reference in New Issue
Block a user