From 1c8a7b6a7c53a41977310da2e5de53e4811060b3 Mon Sep 17 00:00:00 2001 From: Pete Matsyburka Date: Tue, 28 Oct 2025 10:52:37 +0200 Subject: [PATCH] detect fields --- .gitignore | 1 + Dockerfile | 8 +- Gemfile | 2 + Gemfile.lock | 11 + app/controllers/templates_debug_controller.rb | 12 +- .../templates_detect_fields_controller.rb | 27 ++ app/javascript/application.js | 1 + app/javascript/template_builder/builder.vue | 6 + app/javascript/template_builder/fields.vue | 100 +++++- app/javascript/template_builder/i18n.js | 1 + app/views/templates/edit.html.erb | 2 +- config/routes.rb | 3 + lib/pdfium.rb | 160 +++++++++ lib/templates/detect_fields.rb | 264 ++++++++++++++ lib/templates/image_to_fields.rb | 331 ++++++++++++++++++ 15 files changed, 921 insertions(+), 8 deletions(-) create mode 100644 app/controllers/templates_detect_fields_controller.rb create mode 100755 lib/templates/detect_fields.rb create mode 100755 lib/templates/image_to_fields.rb diff --git a/.gitignore b/.gitignore index 5f01e718..d14f4595 100644 --- a/.gitignore +++ b/.gitignore @@ -37,3 +37,4 @@ yarn-debug.log* /docuseal /ee dump.rdb +*.onnx diff --git a/Dockerfile b/Dockerfile index 1e397cf0..b0be901f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,6 +9,7 @@ RUN apk --no-cache add fontforge wget && \ wget https://cdn.jsdelivr.net/gh/notofonts/notofonts.github.io/fonts/NotoSansSymbols2/hinted/ttf/NotoSansSymbols2-Regular.ttf && \ wget https://github.com/Maxattax97/gnu-freefont/raw/master/ttf/FreeSans.ttf && \ wget https://github.com/impallari/DancingScript/raw/master/OFL.txt && \ + wget -O /model.onnx "https://github.com/docusealco/fields-detection/releases/download/1.0.0/model_704_int8.onnx" && \ wget -O pdfium-linux.tgz "https://github.com/docusealco/pdfium-binaries/releases/latest/download/pdfium-linux-$(uname -m | sed 's/x86_64/x64/;s/aarch64/arm64/').tgz" && \ mkdir -p /pdfium-linux && \ tar -xzf pdfium-linux.tgz -C /pdfium-linux @@ -50,7 +51,7 @@ ENV OPENSSL_CONF=/app/openssl_legacy.cnf WORKDIR /app -RUN echo '@edge https://dl-cdn.alpinelinux.org/alpine/edge/community' >> /etc/apk/repositories && apk add --no-cache sqlite-dev libpq-dev mariadb-dev vips-dev@edge yaml-dev redis libheif@edge vips-heif@edge gcompat ttf-freefont && mkdir /fonts && rm /usr/share/fonts/freefont/FreeSans.otf +RUN apk add --no-cache sqlite-dev libpq-dev mariadb-dev vips-dev yaml-dev redis libheif vips-heif gcompat ttf-freefont && mkdir /fonts && rm /usr/share/fonts/freefont/FreeSans.otf RUN echo $'.include = /etc/ssl/openssl.cnf\n\ \n\ @@ -66,7 +67,9 @@ activate = 1' >> /app/openssl_legacy.cnf COPY ./Gemfile ./Gemfile.lock ./ -RUN apk add --no-cache build-base && bundle install && apk del --no-cache build-base && rm -rf ~/.bundle /usr/local/bundle/cache && ruby -e "puts Dir['/usr/local/bundle/**/{spec,rdoc,resources/shared,resources/collation,resources/locales}']" | xargs rm -rf +RUN apk add --no-cache build-base && bundle install && apk del --no-cache build-base && rm -rf ~/.bundle /usr/local/bundle/cache && ruby -e "puts Dir['/usr/local/bundle/**/{spec,rdoc,resources/shared,resources/collation,resources/locales}']" | xargs rm -rf && ln -sf /usr/lib/libonnxruntime.so.1 $(ruby -e "print Dir[Gem::Specification.find_by_name('onnxruntime').gem_dir + '/vendor/*.so'].first") + +RUN echo 'https://dl-cdn.alpinelinux.org/alpine/edge/main' >> /etc/apk/repositories && echo 'https://dl-cdn.alpinelinux.org/alpine/edge/community' >> /etc/apk/repositories && apk add --no-cache onnxruntime COPY ./bin ./bin COPY ./app ./app @@ -83,6 +86,7 @@ COPY --from=download /fonts/GoNotoKurrent-Regular.ttf /fonts/GoNotoKurrent-Bold. COPY --from=download /fonts/FreeSans.ttf /usr/share/fonts/freefont COPY --from=download /pdfium-linux/lib/libpdfium.so /usr/lib/libpdfium.so COPY --from=download /pdfium-linux/licenses/pdfium.txt /usr/lib/libpdfium-LICENSE.txt +COPY --from=download /model.onnx /app/tmp/model.onnx COPY --from=webpack /app/public/packs ./public/packs RUN ln -s /fonts /app/public/fonts diff --git a/Gemfile b/Gemfile index 3a704a0d..b0974208 100644 --- a/Gemfile +++ b/Gemfile @@ -24,7 +24,9 @@ gem 'image_processing' gem 'jwt' gem 'lograge' gem 'mysql2', require: false +gem 'numo-narray' gem 'oj' +gem 'onnxruntime' gem 'pagy' gem 'pg', require: false gem 'premailer-rails' diff --git a/Gemfile.lock b/Gemfile.lock index 84d527e6..f8f8b1ee 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -357,9 +357,18 @@ GEM racc (~> 1.4) nokogiri (1.18.9-x86_64-linux-musl) racc (~> 1.4) + numo-narray (0.9.2.1) oj (3.16.11) bigdecimal (>= 3.0) ostruct (>= 0.2) + onnxruntime (0.10.1) + ffi + onnxruntime (0.10.1-aarch64-linux) + ffi + onnxruntime (0.10.1-arm64-darwin) + ffi + onnxruntime (0.10.1-x86_64-linux) + ffi openssl (3.3.0) orm_adapter (0.5.0) os (1.1.4) @@ -638,7 +647,9 @@ DEPENDENCIES letter_opener_web lograge mysql2 + numo-narray oj + onnxruntime pagy pg premailer-rails diff --git a/app/controllers/templates_debug_controller.rb b/app/controllers/templates_debug_controller.rb index 676c2f64..edaedff0 100644 --- a/app/controllers/templates_debug_controller.rb +++ b/app/controllers/templates_debug_controller.rb @@ -6,12 +6,18 @@ class TemplatesDebugController < ApplicationController DEBUG_FILE = '' def show - attachment = @template.documents.first + schema_uuids = @template.schema.index_by { |e| e['attachment_uuid'] } + attachment = @template.documents.find { |a| schema_uuids[a.uuid] } data = attachment.download - pdf = HexaPDF::Document.new(io: StringIO.new(data)) - fields = Templates::FindAcroFields.call(pdf, attachment, data) + unless attachment.image? + pdf = HexaPDF::Document.new(io: StringIO.new(data)) + + fields = Templates::FindAcroFields.call(pdf, attachment, data) + end + + fields = Templates::DetectFields.call(StringIO.new(data), attachment:) if fields.blank? attachment.metadata['pdf'] ||= {} attachment.metadata['pdf']['fields'] = fields diff --git a/app/controllers/templates_detect_fields_controller.rb b/app/controllers/templates_detect_fields_controller.rb new file mode 100644 index 00000000..8355dcb2 --- /dev/null +++ b/app/controllers/templates_detect_fields_controller.rb @@ -0,0 +1,27 @@ +# frozen_string_literal: true + +class TemplatesDetectFieldsController < ApplicationController + include ActionController::Live + + load_and_authorize_resource :template + + def create + response.headers['Content-Type'] = 'text/event-stream' + + sse = SSE.new(response.stream) + + documents = @template.schema_documents.preload(:blob) + + documents.each do |document| + io = StringIO.new(document.download) + + Templates::DetectFields.call(io, attachment: document) do |(attachment_uuid, page, fields)| + sse.write({ attachment_uuid:, page:, fields: }) + end + end + + sse.write({ completed: true }) + ensure + response.stream.close + end +end diff --git a/app/javascript/application.js b/app/javascript/application.js index 31e28407..9b17172b 100644 --- a/app/javascript/application.js +++ b/app/javascript/application.js @@ -156,6 +156,7 @@ safeRegisterElement('template-builder', class extends HTMLElement { withPhone: this.dataset.withPhone === 'true', withVerification: ['true', 'false'].includes(this.dataset.withVerification) ? this.dataset.withVerification === 'true' : null, withLogo: this.dataset.withLogo !== 'false', + withFieldsDetection: this.dataset.withFieldsDetection === 'true', editable: this.dataset.editable !== 'false', authenticityToken: document.querySelector('meta[name="csrf-token"]')?.content, withPayment: this.dataset.withPayment === 'true', diff --git a/app/javascript/template_builder/builder.vue b/app/javascript/template_builder/builder.vue index 07abcba9..bcf376ac 100644 --- a/app/javascript/template_builder/builder.vue +++ b/app/javascript/template_builder/builder.vue @@ -449,6 +449,7 @@ :default-required-fields="defaultRequiredFields" :field-types="fieldTypes" :with-sticky-submitters="withStickySubmitters" + :with-fields-detection="withFieldsDetection" :with-signature-id="withSignatureId" :with-prefillable="withPrefillable" :only-defined-fields="onlyDefinedFields" @@ -618,6 +619,11 @@ export default { required: false, default: true }, + withFieldsDetection: { + type: Boolean, + required: false, + default: false + }, withAddPageButton: { type: Boolean, required: false, diff --git a/app/javascript/template_builder/fields.vue b/app/javascript/template_builder/fields.vue index 922b98dd..7b789436 100644 --- a/app/javascript/template_builder/fields.vue +++ b/app/javascript/template_builder/fields.vue @@ -208,6 +208,34 @@ +
+ +
{ + return acc + doc.metadata?.pdf?.number_of_pages || doc.preview_images.length + }, 0) + }, isShowFieldSearch () { if (this.withFieldsSearch === false) { return false @@ -389,6 +430,61 @@ export default { this.$emit('set-drag', field) }, + detectFields () { + const fields = [] + + this.fieldPagesLoaded = 0 + + this.baseFetch(`/templates/${this.template.id}/detect_fields`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json' + } + }).then(async (response) => { + const reader = response.body.getReader() + const decoder = new TextDecoder('utf-8') + let buffer = '' + + while (true) { + const { value, done } = await reader.read() + + if (done) break + + buffer += decoder.decode(value, { stream: true }) + + const lines = buffer.split('\n\n') + + buffer = lines.pop() + + for (const line of lines) { + if (line.startsWith('data: ')) { + const jsonStr = line.replace(/^data: /, '') + const data = JSON.parse(jsonStr) + + if (data.completed) { + this.fieldPagesLoaded = null + this.template.fields = fields + + break + } else if (data.fields) { + data.fields.forEach((f) => { + f.submitter_uuid = this.template.submitters[0].uuid + }) + + this.fieldPagesLoaded += 1 + + fields.push(...data.fields) + } + } + } + } + }).catch(error => { + console.error('Error in streaming message: ', error) + }).finally(() => { + this.fieldPagesLoaded = null + this.isFieldsLoading = false + }) + }, setDragPlaceholder (event) { this.$emit('set-drag-placeholder', { offsetX: event.offsetX, diff --git a/app/javascript/template_builder/i18n.js b/app/javascript/template_builder/i18n.js index 75b450fe..a23e5c8b 100644 --- a/app/javascript/template_builder/i18n.js +++ b/app/javascript/template_builder/i18n.js @@ -1,5 +1,6 @@ const en = { view: 'View', + autodetect_fields: 'Autodetect fields', payment_link: 'Payment link', strikeout: 'Strikeout', draw_strikethrough_the_document: 'Draw strikethrough the document', diff --git a/app/views/templates/edit.html.erb b/app/views/templates/edit.html.erb index 79a10118..9fbc8d39 100644 --- a/app/views/templates/edit.html.erb +++ b/app/views/templates/edit.html.erb @@ -6,4 +6,4 @@ <%= button_to nil, user_configs_path, method: :post, params: { user_config: { key: UserConfig::SHOW_APP_TOUR, value: true } }, class: 'hidden', id: 'start_tour_button' %> <% end %> <% end %> - + diff --git a/config/routes.rb b/config/routes.rb index e90bd2f2..43701da1 100644 --- a/config/routes.rb +++ b/config/routes.rb @@ -100,6 +100,9 @@ Rails.application.routes.draw do resource :debug, only: %i[show], controller: 'templates_debug' if Rails.env.development? resources :documents, only: %i[create], controller: 'template_documents' resources :clone_and_replace, only: %i[create], controller: 'templates_clone_and_replace' + if !Docuseal.multitenant? || Docuseal.demo? + resources :detect_fields, only: %i[create], controller: 'templates_detect_fields' + end resources :restore, only: %i[create], controller: 'templates_restore' resources :archived, only: %i[index], controller: 'templates_archived_submissions' resources :submissions, only: %i[new create] diff --git a/lib/pdfium.rb b/lib/pdfium.rb index 317dddf0..464f95e2 100644 --- a/lib/pdfium.rb +++ b/lib/pdfium.rb @@ -25,6 +25,8 @@ class Pdfium typedef :pointer, :FPDF_BITMAP typedef :pointer, :FPDF_FORMHANDLE typedef :pointer, :FPDF_TEXTPAGE + typedef :pointer, :FPDF_PAGEOBJECT + typedef :pointer, :FPDF_PATHSEGMENT MAX_SIZE = 32_767 @@ -37,6 +39,9 @@ class Pdfium FPDF_RENDER_FORCEHALFTONE = 0x400 FPDF_PRINTING = 0x800 + TextNode = Struct.new(:content, :x, :y, :w, :h, keyword_init: true) + LineNode = Struct.new(:x, :y, :w, :h, :tilt, keyword_init: true) + # rubocop:disable Naming/ClassAndModuleCamelCase class FPDF_LIBRARY_CONFIG < FFI::Struct layout :version, :int, @@ -77,6 +82,37 @@ class Pdfium attach_function :FPDFText_ClosePage, [:FPDF_TEXTPAGE], :void attach_function :FPDFText_CountChars, [:FPDF_TEXTPAGE], :int attach_function :FPDFText_GetText, %i[FPDF_TEXTPAGE int int pointer], :int + attach_function :FPDFText_GetUnicode, %i[FPDF_TEXTPAGE int], :uint + attach_function :FPDFText_GetCharBox, %i[FPDF_TEXTPAGE int pointer pointer pointer pointer], :int + attach_function :FPDFText_GetCharOrigin, %i[FPDF_TEXTPAGE int pointer pointer], :int + attach_function :FPDFText_GetCharIndexAtPos, %i[FPDF_TEXTPAGE double double double double], :int + attach_function :FPDFText_CountRects, %i[FPDF_TEXTPAGE int int], :int + attach_function :FPDFText_GetRect, %i[FPDF_TEXTPAGE int pointer pointer pointer pointer], :int + attach_function :FPDFText_GetFontSize, %i[FPDF_TEXTPAGE int], :double + + # Page object functions for extracting paths/lines + attach_function :FPDFPage_CountObjects, [:FPDF_PAGE], :int + attach_function :FPDFPage_GetObject, %i[FPDF_PAGE int], :FPDF_PAGEOBJECT + attach_function :FPDFPageObj_GetType, [:FPDF_PAGEOBJECT], :int + attach_function :FPDFPageObj_GetBounds, %i[FPDF_PAGEOBJECT pointer pointer pointer pointer], :int + attach_function :FPDFPath_CountSegments, [:FPDF_PAGEOBJECT], :int + attach_function :FPDFPath_GetPathSegment, %i[FPDF_PAGEOBJECT int], :FPDF_PATHSEGMENT + attach_function :FPDFPathSegment_GetType, [:FPDF_PATHSEGMENT], :int + attach_function :FPDFPathSegment_GetPoint, %i[FPDF_PATHSEGMENT pointer pointer], :int + + # Page object types + FPDF_PAGEOBJ_UNKNOWN = 0 + FPDF_PAGEOBJ_TEXT = 1 + FPDF_PAGEOBJ_PATH = 2 + FPDF_PAGEOBJ_IMAGE = 3 + FPDF_PAGEOBJ_SHADING = 4 + FPDF_PAGEOBJ_FORM = 5 + + # Path segment types + FPDF_SEGMENT_UNKNOWN = -1 + FPDF_SEGMENT_LINETO = 0 + FPDF_SEGMENT_BEZIERTO = 1 + FPDF_SEGMENT_MOVETO = 2 typedef :int, :FPDF_BOOL typedef :pointer, :IPDF_JSPLATFORM @@ -157,6 +193,7 @@ class Pdfium raise PdfiumError, "#{context_message}: #{error_message(error_code)} (Code: #{error_code})" end + # rubocop:disable Metrics class Document attr_reader :document_ptr, :form_handle @@ -386,6 +423,128 @@ class Pdfium Pdfium.FPDFText_ClosePage(text_page) if text_page && !text_page.null? end + def text_nodes + return @text_nodes if @text_nodes + + text_page = Pdfium.FPDFText_LoadPage(page_ptr) + char_count = Pdfium.FPDFText_CountChars(text_page) + + @text_nodes = [] + + return @text_nodes if char_count.zero? + + char_count.times do |i| + unicode = Pdfium.FPDFText_GetUnicode(text_page, i) + + char = [unicode].pack('U*') + + left_ptr = FFI::MemoryPointer.new(:double) + right_ptr = FFI::MemoryPointer.new(:double) + bottom_ptr = FFI::MemoryPointer.new(:double) + top_ptr = FFI::MemoryPointer.new(:double) + + result = Pdfium.FPDFText_GetCharBox(text_page, i, left_ptr, right_ptr, bottom_ptr, top_ptr) + + next if result.zero? + + left = left_ptr.read_double + right = right_ptr.read_double + + origin_x_ptr = FFI::MemoryPointer.new(:double) + origin_y_ptr = FFI::MemoryPointer.new(:double) + + Pdfium.FPDFText_GetCharOrigin(text_page, i, origin_x_ptr, origin_y_ptr) + + origin_y = origin_y_ptr.read_double + + font_size = Pdfium.FPDFText_GetFontSize(text_page, i) + font_size = 8 if font_size == 1 + + abs_x = left + abs_y = height - origin_y - (font_size * 0.8) + abs_width = right - left + abs_height = font_size + + x = abs_x / width + y = abs_y / height + node_width = abs_width / width + node_height = abs_height / height + + @text_nodes << TextNode.new(content: char, x: x, y: y, w: node_width, h: node_height) + end + + @text_nodes = @text_nodes.sort { |a, b| a.y == b.y ? a.x <=> b.x : a.y <=> b.y } + ensure + Pdfium.FPDFText_ClosePage(text_page) if text_page && !text_page.null? + end + + def line_nodes + return @line_nodes if @line_nodes + + ensure_not_closed! + + @line_nodes = [] + + object_count = Pdfium.FPDFPage_CountObjects(page_ptr) + + return @line_nodes if object_count.zero? + + object_count.times do |i| + page_object = Pdfium.FPDFPage_GetObject(page_ptr, i) + + next if page_object.null? + + obj_type = Pdfium.FPDFPageObj_GetType(page_object) + + next unless obj_type == Pdfium::FPDF_PAGEOBJ_PATH + + left_ptr = FFI::MemoryPointer.new(:float) + bottom_ptr = FFI::MemoryPointer.new(:float) + right_ptr = FFI::MemoryPointer.new(:float) + top_ptr = FFI::MemoryPointer.new(:float) + + Pdfium.FPDFPageObj_GetBounds(page_object, left_ptr, bottom_ptr, right_ptr, top_ptr) + + obj_left = left_ptr.read_float + obj_bottom = bottom_ptr.read_float + obj_right = right_ptr.read_float + obj_top = top_ptr.read_float + + obj_width = obj_right - obj_left + obj_height = obj_top - obj_bottom + + next if obj_width < 1 && obj_height < 1 + + segment_count = Pdfium.FPDFPath_CountSegments(page_object) + + next if segment_count < 2 + + next unless segment_count <= 10 && (obj_height < 10 || obj_width < 10) + + if obj_width > obj_height && obj_height < 10 + tilt = 0 + elsif obj_height > obj_width && obj_width < 10 + tilt = 90 + else + next + end + + x = obj_left + y = obj_bottom + w = obj_width + h = obj_height + + norm_x = x / width + norm_y = (height - y - h) / height + norm_w = w / width + norm_h = h / height + + @line_nodes << LineNode.new(x: norm_x, y: norm_y, w: norm_w, h: norm_h, tilt: tilt) + end + + @line_nodes = @line_nodes.sort { |a, b| a.y == b.y ? a.x <=> b.x : a.y <=> b.y } + end + def close return if closed? @@ -445,4 +604,5 @@ class Pdfium at_exit do cleanup_library end + # rubocop:enable Metrics end diff --git a/lib/templates/detect_fields.rb b/lib/templates/detect_fields.rb new file mode 100755 index 00000000..20bb9e3f --- /dev/null +++ b/lib/templates/detect_fields.rb @@ -0,0 +1,264 @@ +# frozen_string_literal: true + +module Templates + module DetectFields + module_function + + TextFieldBox = Struct.new(:x, :y, :w, :h, keyword_init: true) + + # rubocop:disable Metrics + def call(io, attachment: nil, confidence: 0.3, temperature: 1, + nms: 0.1, split_page: false, aspect_ratio: true, padding: 20, &) + if attachment&.image? + process_image_attachment(io, attachment:, confidence:, nms:, split_page:, + temperature:, aspect_ratio:, padding:, &) + else + process_pdf_attachment(io, attachment:, confidence:, nms:, split_page:, + temperature:, aspect_ratio:, padding:, &) + end + end + + def process_image_attachment(io, attachment:, confidence:, nms:, temperature: 1, + split_page: false, aspect_ratio: false, padding: nil) + image = Vips::Image.new_from_buffer(io.read, '') + + fields = Templates::ImageToFields.call(image, confidence:, nms:, split_page:, + temperature:, aspect_ratio:, padding:) + + fields = fields.map do |f| + { + uuid: SecureRandom.uuid, + type: f.type, + required: true, + preferences: {}, + areas: [{ + x: f.x, + y: f.y, + w: f.w, + h: f.h, + page: 0, + attachment_uuid: attachment&.uuid + }] + } + end + + yield [attachment&.uuid, 0, fields] if block_given? + + fields + end + + def process_pdf_attachment(io, attachment:, confidence:, nms:, temperature: 1, + split_page: false, aspect_ratio: false, padding: nil) + doc = Pdfium::Document.open_bytes(io.read) + + doc.page_count.times.flat_map do |page_number| + page = doc.get_page(page_number) + + data, width, height = page.render_to_bitmap(width: ImageToFields::RESOLUTION * 1.5) + + image = Vips::Image.new_from_memory(data, width, height, 4, :uchar) + + fields = Templates::ImageToFields.call(image, confidence: 0.05, nms:, split_page:, + temperature:, aspect_ratio:, padding:) + + text_fields = extract_text_fields_from_page(page) + line_fields = extract_line_fields_from_page(page) + + fields = increase_confidence_for_overlapping_fields(fields, text_fields) + fields = increase_confidence_for_overlapping_fields(fields, line_fields) + + fields = fields.filter_map do |f| + next if f.confidence < confidence + + { + uuid: SecureRandom.uuid, + type: f.type, + required: true, + preferences: {}, + areas: [{ + x: f.x, y: f.y, + w: f.w, h: f.h, + page: page_number, + attachment_uuid: attachment&.uuid + }] + } + end + + yield [attachment&.uuid, page_number, fields] if block_given? + + fields + end + end + + def extract_line_fields_from_page(page) + line_thickness = 5.0 / page.height + + vertical_lines, all_horizontal_lines = page.line_nodes.partition { |line| line.tilt == 90 } + + horizontal_lines = all_horizontal_lines.reject do |h_line| + next true if h_line.w > 0.7 && (h_line.h < 0.1 || h_line.h < 0.9) + + next false if vertical_lines.blank? + + h_x_min = h_line.x + h_x_max = h_line.x + h_line.w + h_y_avg = h_line.y + (h_line.h / 2) + + vertical_lines.any? do |v_line| + v_x_avg = v_line.x + (v_line.w / 2) + v_y_min = v_line.y + v_y_max = v_line.y + v_line.h + + h_x_min_expanded = h_x_min - line_thickness + h_x_max_expanded = h_x_max + line_thickness + h_y_min_expanded = h_y_avg - line_thickness + h_y_max_expanded = h_y_avg + line_thickness + + v_x_min_expanded = v_x_avg - line_thickness + v_x_max_expanded = v_x_avg + line_thickness + v_y_min_expanded = v_y_min - line_thickness + v_y_max_expanded = v_y_max + line_thickness + + x_overlap = v_x_min_expanded <= h_x_max_expanded && v_x_max_expanded >= h_x_min_expanded + y_overlap = h_y_min_expanded <= v_y_max_expanded && h_y_max_expanded >= v_y_min_expanded + + x_overlap && y_overlap + end + end + + node_index = 0 + + horizontal_lines = horizontal_lines.reject do |line| + nodes = [] + + loop do + node = page.text_nodes[node_index += 1] + + break unless node + + break if node.y > line.y + + next if node.x + node.w < line.x || line.x + line.w < node.x || + node.y + node.h < line.y - node.h || line.y < node.y + + nodes << node + + next if nodes.blank? + + next_node = page.text_nodes[node_index + 1] + + break if next_node.x + next_node.w < line.x || line.x + line.w < next_node.x || + next_node.y + next_node.h < line.y - next_node.h || line.y < next_node.y + end + + next if nodes.blank? + + width = nodes.last.x + nodes.last.w - nodes.first.x + + next true if width > line.w / 2.0 + end + + horizontal_lines.each do |line| + line.h += 4 * line_thickness + line.y -= 4 * line_thickness + end + end + + def extract_text_fields_from_page(page) + text_nodes = page.text_nodes + + field_boxes = [] + + i = 0 + + while i < text_nodes.length + node = text_nodes[i] + + next i += 1 if node.content != '_' + + x1 = node.x + y1 = node.y + x2 = node.x + node.w + y2 = node.y + node.h + + underscore_count = 1 + + j = i + 1 + + while j < text_nodes.length + next_node = text_nodes[j] + + break unless next_node.content == '_' + + distance = next_node.x - x2 + height_diff = (next_node.y - y1).abs + + break if distance > 0.02 || height_diff > node.h * 0.5 + + underscore_count += 1 + next_x2 = next_node.x + next_node.w + next_y2 = next_node.y + next_node.h + + x2 = next_x2 + y2 = [y2, next_y2].max + y1 = [y1, next_node.y].min + + j += 1 + end + + field_boxes << TextFieldBox.new(x: x1, y: y1, w: x2 - x1, h: y2 - y1) if underscore_count >= 2 + + i = j + end + + field_boxes + end + + def calculate_iou(box1, box2) + x1 = [box1.x, box2.x].max + y1 = [box1.y, box2.y].max + x2 = [box1.x + box1.w, box2.x + box2.w].min + y2 = [box1.y + box1.h, box2.y + box2.h].min + + intersection_width = [0, x2 - x1].max + intersection_height = [0, y2 - y1].max + intersection_area = intersection_width * intersection_height + + return 0.0 if intersection_area.zero? + + box1_area = box1.w * box1.h + box2_area = box2.w * box2.h + union_area = box1_area + box2_area - intersection_area + + intersection_area / union_area + end + + def boxes_overlap?(box1, box2) + !(box1.x + box1.w < box2.x || box2.x + box2.w < box1.x || + box1.y + box1.h < box2.y || box2.y + box2.h < box1.y) + end + + def increase_confidence_for_overlapping_fields(image_fields, text_fields, by: 1.0) + return image_fields if text_fields.blank? + + image_fields.map do |image_field| + next if image_field.type != 'text' + + field_bottom = image_field.y + image_field.h + + text_fields.each do |text_field| + break if text_field.y > field_bottom + + next if text_field.y + text_field.h < image_field.y + + next unless boxes_overlap?(image_field, text_field) && calculate_iou(image_field, text_field) > 0.5 + + break image_field.confidence += by + end + end + + image_fields + end + # rubocop:enable Metrics + end +end diff --git a/lib/templates/image_to_fields.rb b/lib/templates/image_to_fields.rb new file mode 100755 index 00000000..ad747aff --- /dev/null +++ b/lib/templates/image_to_fields.rb @@ -0,0 +1,331 @@ +# frozen_string_literal: true + +module Templates + module ImageToFields + module_function + + Field = Struct.new(:type, :x, :y, :w, :h, :confidence, keyword_init: true) + + MODEL_PATH = Rails.root.join('tmp/model_704_int8.onnx') + + RESOLUTION = 704 + + ID_TO_CLASS = %w[text checkbox].freeze + + MEAN = [0.485, 0.456, 0.406].freeze + STD = [0.229, 0.224, 0.225].freeze + + CPU_THREADS = Etc.nprocessors + + # rubocop:disable Metrics + def call(image, confidence: 0.3, nms: 0.1, temperature: 1, + split_page: false, aspect_ratio: true, padding: nil) + base_image = image.extract_band(0, n: 3) + + trimmed_base, base_offset_x, base_offset_y = trim_image_with_padding(base_image, padding) + + if split_page && image.height > image.width + half_h = trimmed_base.height / 2 + top_h = half_h + bottom_h = trimmed_base.height - half_h + + regions = [ + { img: trimmed_base.crop(0, 0, trimmed_base.width, top_h), offset_y: 0 }, + { img: trimmed_base.crop(0, top_h, trimmed_base.width, bottom_h), offset_y: top_h } + ] + + detections = { xyxy: Numo::SFloat[], confidence: Numo::SFloat[], class_id: Numo::Int32[] } + + detections = regions.reduce(detections) do |acc, r| + next detections if r[:img].height <= 0 || r[:img].width <= 0 + + input_tensor, transform_info = preprocess_image(r[:img], RESOLUTION, aspect_ratio:) + + transform_info[:trim_offset_x] = base_offset_x + transform_info[:trim_offset_y] = base_offset_y + r[:offset_y] + + outputs = model.predict({ 'input' => input_tensor }) + + postprocess_outputs(outputs, transform_info, acc, confidence:, temperature:) + end + else + input_tensor, transform_info = preprocess_image(trimmed_base, RESOLUTION, aspect_ratio:) + + transform_info[:trim_offset_x] = base_offset_x + transform_info[:trim_offset_y] = base_offset_y + + outputs = model.predict({ 'input' => input_tensor }) + + detections = postprocess_outputs(outputs, transform_info, confidence:, temperature:) + end + + detections = apply_nms(detections, nms) + + fields = Array.new(detections[:xyxy].shape[0]) do |i| + x1 = detections[:xyxy][i, 0] + y1 = detections[:xyxy][i, 1] + x2 = detections[:xyxy][i, 2] + y2 = detections[:xyxy][i, 3] + + class_id = detections[:class_id][i].to_i + + confidence = detections[:confidence][i] + + x0_norm = x1 / image.width.to_f + y0_norm = y1 / image.height.to_f + x1_norm = x2 / image.width.to_f + y1_norm = y2 / image.height.to_f + + type_name = ID_TO_CLASS[class_id] + + Field.new( + type: type_name, + x: x0_norm, + y: y0_norm, + w: (x1_norm - x0_norm), + h: (y1_norm - y0_norm), + confidence: + ) + end + + sort_fields(fields, y_threshold: 10.0 / image.height) + end + + def trim_image_with_padding(image, padding = 0) + return [image, 0, 0] if padding.nil? + + left, top, trim_width, trim_height = image.find_trim(threshold: 10, background: [255, 255, 255]) + + padded_left = [left - padding, 0].max + padded_top = [top - padding, 0].max + padded_right = [left + trim_width + padding, image.width].min + padded_bottom = [top + trim_height + padding, image.height].min + + width = padded_right - padded_left + height = padded_bottom - padded_top + + trimmed_image = image.crop(padded_left, padded_top, width, height) + + [trimmed_image, padded_left, padded_top] + end + + def preprocess_image(image, resolution, aspect_ratio: false) + scale_x = resolution.to_f / image.width + scale_y = resolution.to_f / image.height + + if aspect_ratio + scale = [scale_x, scale_y].min + + new_width = (image.width * scale).round + new_height = (image.height * scale).round + + resized = image.resize(scale, vscale: scale, kernel: :lanczos3) + + pad_x = ((resolution - new_width) / 2.0).round + pad_y = ((resolution - new_height) / 2.0).round + + image = resized.embed(pad_x, pad_y, resolution, resolution, background: [255, 255, 255]) + + transform_info = { scale_x: scale, scale_y: scale, pad_x: pad_x, pad_y: pad_y } + else + image = image.resize(scale_x, vscale: scale_y, kernel: :lanczos3) + + transform_info = { scale_x: scale_x, scale_y: scale_y, pad_x: 0, pad_y: 0 } + end + + image /= 255.0 + + image = (image - MEAN) / STD + + pixel_data = image.write_to_memory + + img_array = Numo::SFloat.from_binary(pixel_data, [resolution, resolution, 3]) + + img_array = img_array.transpose(2, 0, 1) + + [img_array.reshape(1, 3, resolution, resolution), transform_info] + end + + def nms(boxes, scores, iou_threshold = 0.5) + return Numo::Int32[] if boxes.shape[0].zero? + + x1 = boxes[true, 0] + y1 = boxes[true, 1] + x2 = boxes[true, 2] + y2 = boxes[true, 3] + + areas = (x2 - x1) * (y2 - y1) + order = scores.sort_index.reverse + + keep = [] + + while order.size.positive? + i = order[0] + keep << i + + break if order.size == 1 + + xx1 = Numo::SFloat.maximum(x1[i], x1[order[1..]]) + yy1 = Numo::SFloat.maximum(y1[i], y1[order[1..]]) + xx2 = Numo::SFloat.minimum(x2[i], x2[order[1..]]) + yy2 = Numo::SFloat.minimum(y2[i], y2[order[1..]]) + + w = Numo::SFloat.maximum(0.0, xx2 - xx1) + h = Numo::SFloat.maximum(0.0, yy2 - yy1) + + intersection = w * h + + iou = intersection / (areas[i] + areas[order[1..]] - intersection) + + inds = iou.le(iou_threshold).where + + order = order[inds + 1] + end + + Numo::Int32.cast(keep) + end + + def postprocess_outputs(outputs, transform_info, detections = nil, confidence: 0.3, temperature: 1) + boxes = Numo::SFloat.cast(outputs['dets']) + logits = Numo::SFloat.cast(outputs['labels']) + + boxes = boxes[0, true, true] # [300, 4] + logits = logits[0, true, true] # [300, num_classes] + + scaled_logits = logits / temperature + + probs = 1.0 / (1.0 + Numo::NMath.exp(-scaled_logits)) + + scores = probs.max(axis: 1) + labels = probs.argmax(axis: 1) + + cx = boxes[true, 0] + cy = boxes[true, 1] + w = boxes[true, 2] + h = boxes[true, 3] + + x1 = cx - (w / 2.0) + y1 = cy - (h / 2.0) + x2 = cx + (w / 2.0) + y2 = cy + (h / 2.0) + + boxes_xyxy = Numo::SFloat.zeros(boxes.shape[0], 4) + boxes_xyxy[true, 0] = x1 + boxes_xyxy[true, 1] = y1 + boxes_xyxy[true, 2] = x2 + boxes_xyxy[true, 3] = y2 + + boxes_xyxy *= RESOLUTION + + pad_x = transform_info[:pad_x] + pad_y = transform_info[:pad_y] + boxes_xyxy[true, 0] -= pad_x + boxes_xyxy[true, 1] -= pad_y + boxes_xyxy[true, 2] -= pad_x + boxes_xyxy[true, 3] -= pad_y + + scale_x = transform_info[:scale_x] + scale_y = transform_info[:scale_y] + boxes_xyxy[true, 0] /= scale_x + boxes_xyxy[true, 1] /= scale_y + boxes_xyxy[true, 2] /= scale_x + boxes_xyxy[true, 3] /= scale_y + + trim_offset_x = transform_info[:trim_offset_x] + trim_offset_y = transform_info[:trim_offset_y] + boxes_xyxy[true, 0] += trim_offset_x + boxes_xyxy[true, 1] += trim_offset_y + boxes_xyxy[true, 2] += trim_offset_x + boxes_xyxy[true, 3] += trim_offset_y + + keep_mask = scores.gt(confidence) + + keep_indices = keep_mask.where + + if keep_indices.empty? + detections || { + xyxy: Numo::SFloat[], + confidence: Numo::SFloat[], + class_id: Numo::Int32[] + } + else + scores = scores[keep_indices] + labels = labels[keep_indices] + boxes_xyxy = boxes_xyxy[keep_indices, true] + + if detections + existing_n = detections[:xyxy].shape[0] + new_n = boxes_xyxy.shape[0] + total = existing_n + new_n + + xyxy = Numo::SFloat.zeros(total, 4) + conf = Numo::SFloat.zeros(total) + cls = Numo::Int32.zeros(total) + + if existing_n.positive? + xyxy[0...existing_n, true] = detections[:xyxy] + conf[0...existing_n] = detections[:confidence] + cls[0...existing_n] = detections[:class_id] + end + + xyxy[existing_n...(existing_n + new_n), true] = boxes_xyxy + conf[existing_n...(existing_n + new_n)] = scores + cls[existing_n...(existing_n + new_n)] = Numo::Int32.cast(labels) + + { xyxy: xyxy, confidence: conf, class_id: cls } + else + { + xyxy: boxes_xyxy, + confidence: scores, + class_id: Numo::Int32.cast(labels) + } + end + end + end + + def sort_fields(fields, y_threshold: 0.01) + sorted_fields = fields.sort { |a, b| a.y == b.y ? a.x <=> b.x : a.y <=> b.y } + + lines = [] + current_line = [] + + sorted_fields.each do |field| + if current_line.blank? || (field.y - current_line.first.y).abs < y_threshold + current_line << field + else + lines << current_line.sort_by(&:x) + + current_line = [field] + end + end + + lines << current_line.sort_by(&:x) if current_line.present? + + lines.flatten + end + + def apply_nms(detections, threshold = 0.5) + return detections if detections[:xyxy].shape[0].zero? + + keep_indices = nms(detections[:xyxy], detections[:confidence], threshold) + + { + xyxy: detections[:xyxy][keep_indices, true], + confidence: detections[:confidence][keep_indices], + class_id: detections[:class_id][keep_indices] + } + end + + def model + @model ||= OnnxRuntime::Model.new( + MODEL_PATH.to_s, + inter_op_num_threads: CPU_THREADS, + intra_op_num_threads: CPU_THREADS, + enable_mem_pattern: false, + enable_cpu_mem_arena: false, + providers: ['CPUExecutionProvider'] + ) + end + # rubocop:enable Metrics + end +end