-
Notifications
You must be signed in to change notification settings - Fork 6
Add support for semantic search #955
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -66,6 +66,8 @@ def record_id(id:, index:) | |
| argument :boolean_type, String, required: false, default_value: 'OR', | ||
| description: 'How to join multiword queries. Defaults to "OR" which means any ' \ | ||
| 'of the words much match. Options include: "OR", "AND"' | ||
| argument :query_mode, String, required: false, default_value: 'keyword', | ||
| description: 'Search mode, either "keyword" or "semantic"' | ||
|
|
||
| # applied filters | ||
| argument :access_to_files_filter, [String], | ||
|
|
@@ -103,11 +105,11 @@ def record_id(id:, index:) | |
| end | ||
|
|
||
| def search(searchterm:, citation:, contributors:, funding_information:, geodistance:, geobox:, identifiers:, | ||
| locations:, subjects:, title:, index:, source:, from:, boolean_type:, fulltext:, per_page: 20, **filters) | ||
| locations:, subjects:, title:, index:, source:, from:, boolean_type:, fulltext:, per_page: 20, query_mode: 'keyword', **filters) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| query = construct_query(searchterm, citation, contributors, funding_information, geodistance, geobox, identifiers, | ||
| locations, subjects, title, source, boolean_type, filters, per_page) | ||
| locations, subjects, title, source, boolean_type, filters, per_page, query_mode) | ||
|
|
||
| results = Opensearch.new.search(from, query, Timdex::OSClient, highlight_requested?, index, fulltext) | ||
| results = Opensearch.new.search(from, query, Timdex::OSClient, highlight: highlight_requested?, index: index, fulltext: fulltext, query_mode: query_mode) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
|
|
||
| response = {} | ||
| response[:hits] = results['hits']['total']['value'] | ||
|
|
@@ -135,9 +137,10 @@ def inject_hits_fields_into_source(hits) | |
| end | ||
|
|
||
| def construct_query(searchterm, citation, contributors, funding_information, geodistance, geobox, identifiers, | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| locations, subjects, title, source, boolean_type, filters, per_page) | ||
| locations, subjects, title, source, boolean_type, filters, per_page, query_mode = 'keyword') | ||
| query = {} | ||
| query[:q] = searchterm | ||
| query[:query_mode] = query_mode | ||
| query[:boolean_type] = boolean_type | ||
| query[:citation] = citation | ||
| query[:contributors] = contributors | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,13 +1,13 @@ | ||
| # rubocop:disable Metrics/ClassLength | ||
| # rubocop:disable Metrics/MethodLength | ||
| class Opensearch | ||
| SIZE = 20 | ||
| MAX_SIZE = 200 | ||
|
|
||
| def search(from, params, client, highlight = false, index = nil, fulltext = false) | ||
| def search(from, params, client, highlight: false, index: nil, fulltext: false, query_mode: 'keyword') | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| @params = params | ||
| @highlight = highlight | ||
| @fulltext = fulltext?(fulltext) | ||
| @query_mode = query_mode | ||
| index = default_index unless index.present? | ||
| client.search(index:, | ||
| body: build_query(from)) | ||
|
qltysh[bot] marked this conversation as resolved.
|
||
|
|
@@ -54,8 +54,14 @@ def build_query(from) | |
|
|
||
| # Build the query portion of the elasticsearch json | ||
| def query | ||
| @query_strategy ||= LexicalQueryBuilder.new | ||
| @query_strategy.build(@params, @fulltext) | ||
| builder = case @query_mode | ||
| when 'semantic' | ||
| SemanticQueryBuilder.new | ||
| else | ||
| LexicalQueryBuilder.new | ||
| end | ||
|
|
||
| builder.build(@params, fulltext: @fulltext) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice, the pattern we put in place anticipating this is working as cleanly as we hoped. |
||
| end | ||
|
|
||
| def sort_builder | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,50 @@ | ||
| class SemanticQueryBuilder | ||
| def build(params, fulltext: false) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| query_text = params[:q].to_s.strip | ||
|
|
||
| # If no query text provided, return a match_all query (consistent with keyword search behavior) | ||
| return { match_all: {} } if query_text.blank? | ||
|
|
||
| lambda_response = invoke_semantic_builder(query_text) | ||
| parse_lambda_response(lambda_response) | ||
| end | ||
|
|
||
| private | ||
|
|
||
| def invoke_semantic_builder(query_text) | ||
| payload = { query: query_text } | ||
|
|
||
| response = Timdex::LambdaClient.invoke( | ||
| function_name: ENV.fetch('TIMDEX_SEMANTIC_BUILDER_FUNCTION_NAME'), | ||
| invocation_type: 'RequestResponse', | ||
| payload: payload.to_json | ||
| ) | ||
|
|
||
| parse_lambda_payload(response.payload) | ||
| rescue StandardError => e | ||
| raise "Semantic query builder Lambda error: #{e.message}" | ||
|
qltysh[bot] marked this conversation as resolved.
|
||
| end | ||
|
|
||
| def parse_lambda_payload(payload) | ||
| # AWS Lambda response payload can be an IO-like object (e.g., StringIO) or a string | ||
| payload_str = if payload.respond_to?(:read) | ||
| payload.read | ||
| else | ||
| payload.to_s | ||
| end | ||
| JSON.parse(payload_str) | ||
| rescue JSON::ParserError => e | ||
| raise "Invalid JSON response from semantic query builder: #{e.message}" | ||
| end | ||
|
|
||
| def parse_lambda_response(lambda_response) | ||
| # Lambda returns: { "query": { "bool": { "should": [...] } } } | ||
| # We extract and return just the inner query object | ||
| raise "Invalid semantic query builder response: missing 'query' key" unless lambda_response.key?('query') | ||
|
|
||
| query = lambda_response['query'] | ||
| raise 'Invalid semantic query builder response: query must be a Hash' unless query.is_a?(Hash) | ||
|
|
||
| query | ||
| end | ||
| end | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| require 'aws-sdk-lambda' | ||
|
|
||
| def configure_lambda_client | ||
| Aws::Lambda::Client.new( | ||
| region: ENV.fetch('AWS_REGION', 'us-east-1'), | ||
| access_key_id: ENV.fetch('AWS_ACCESS_KEY_ID'), | ||
| secret_access_key: ENV.fetch('AWS_SECRET_ACCESS_KEY') | ||
| ) | ||
| end | ||
|
|
||
| Timdex::LambdaClient = configure_lambda_client |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,20 +15,20 @@ def os_client | |
| def aws_os_client | ||
| OpenSearch::Client.new log: ENV.fetch('OPENSEARCH_LOG', false), url: ENV['OPENSEARCH_URL'] do |config| | ||
| # personal keys use expiring credentials with tokens | ||
| if ENV.fetch('AWS_OPENSEARCH_SESSION_TOKEN', false) | ||
| if ENV['AWS_SESSION_TOKEN'].present? | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's make sure to have all of the affect ENV duplicated in all tiers before this merges please. |
||
| config.request :aws_sigv4, | ||
| service: 'es', | ||
| region: ENV['AWS_REGION'], | ||
| access_key_id: ENV['AWS_OPENSEARCH_ACCESS_KEY_ID'], | ||
| secret_access_key: ENV['AWS_OPENSEARCH_SECRET_ACCESS_KEY'], | ||
| session_token: ENV['AWS_OPENSEARCH_SESSION_TOKEN'] | ||
| access_key_id: ENV['AWS_ACCESS_KEY_ID'], | ||
| secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'], | ||
| session_token: ENV['AWS_SESSION_TOKEN'] | ||
| # application keys don't use tokens | ||
| else | ||
| config.request :aws_sigv4, | ||
| service: 'es', | ||
| region: ENV['AWS_REGION'], | ||
| access_key_id: ENV['AWS_OPENSEARCH_ACCESS_KEY_ID'], | ||
| secret_access_key: ENV['AWS_OPENSEARCH_SECRET_ACCESS_KEY'] | ||
| access_key_id: ENV['AWS_ACCESS_KEY_ID'], | ||
| secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'] | ||
| end | ||
| end | ||
| end | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Found 2 issues:
1. Function with many parameters (count = 18): search [qlty:function-parameters]
2. Avoid parameter lists longer than 5 parameters. [18/5] [rubocop:Metrics/ParameterLists]