Skip to content

Commit aa7d993

Browse files
authored
Allow parsers to define parameters for URL normalization (#451)
* Allow parsers to define parameters for URL normalization Allow parsers to define URL parameters for normalization. The provided parameters will be stripped from the URL.
1 parent 09bcc8c commit aa7d993

File tree

5 files changed

+68
-0
lines changed

5 files changed

+68
-0
lines changed

README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -454,6 +454,16 @@ To enable sampling for Honeycomb, set the following configuration (either in `co
454454

455455
**Note**: If sampling behavior is changed in Pender, we will also need to update the behavior to match in any other application reporting to Honeycomb. More [here](https://docs.honeycomb.io/getting-data-in/opentelemetry/ruby/#sampling)
456456

457+
### URL Parameters Normalization
458+
459+
Some service providers include URL parameters for tracking purposes that can be safely removed. Pender parsers can define a list of such parameters to be removed during the URL normalization process.
460+
461+
To define URL parameters to be removed, a parser class should implement the `urls_parameters_to_remove` method, which returns an array of strings representing the parameters to be stripped. For example:
462+
463+
```ruby
464+
def urls_parameters_to_remove
465+
['igsh']
466+
end
```
457467

458468
#### Environment overrides
459469

app/models/media.rb

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ def initialize(attributes = {})
6464
self.follow_redirections
6565
self.url = RequestHelper.normalize_url(self.url) unless self.get_canonical_url
6666
self.try_https
67+
self.remove_parser_specific_parameters
6768
self.parser = nil
6869
end
6970

@@ -275,6 +276,37 @@ def try_https
275276
end
276277
end
277278

279+
# Strips the query parameters that the matching parser declares as removable.
#
# Looks up the parser class whose patterns match the current URL and, when that
# class responds to +urls_parameters_to_remove+, drops every query parameter
# whose name exactly equals one of the returned strings. Every other parameter
# is kept in its original order, including repeated keys. The URL is left
# untouched when no parser matches or when nothing is actually removed.
def remove_parser_specific_parameters
  parser_class = self.class.find_parser_class(self.url)
  return unless parser_class&.respond_to?(:urls_parameters_to_remove)

  params_to_remove = parser_class.urls_parameters_to_remove
  # Cheap substring pre-check so URLs that cannot contain a listed parameter are never parsed.
  return unless params_to_remove.any? { |param| self.url.include?(param) }

  uri = URI.parse(self.url)
  pairs = URI.decode_www_form(uri.query || '')
  # Filter the pair list instead of a Hash: a Hash would collapse repeated keys such as `tag=a&tag=b`.
  kept = pairs.reject { |key, _value| params_to_remove.include?(key) }
  # The substring matched somewhere other than a parameter name; do not re-encode the URL.
  return if kept.size == pairs.size

  uri.query = kept.empty? ? nil : URI.encode_www_form(kept)
  # No trailing '/' is re-appended: URI#to_s already preserves the path as parsed,
  # and appending one after a query string would alter the last parameter's value.
  self.url = uri.to_s
end
302+
303+
# Returns the first entry of PARSERS that has a pattern matching +url+,
# or nil when no entry matches.
def self.find_parser_class(url)
  PARSERS.find do |candidate|
    candidate.patterns.any? { |regex| regex.match?(url) }
  end
end
309+
278310
# Delegates to RequestHelper.get_html for this media's URL, handing it the
# bound set_error method together with the given header options and proxy flag.
def get_html(header_options = {}, force_proxy = false)
279311
RequestHelper.get_html(self.url, self.method(:set_error), header_options, force_proxy)
280312
end

app/models/parser/instagram_item.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ def type
1313
# Returns the URL patterns for this parser: the Instagram item and home patterns.
def patterns
1414
[INSTAGRAM_ITEM_URL, INSTAGRAM_HOME_URL]
1515
end
16+
17+
# Names of the query parameters to strip from URLs handled by this parser.
def urls_parameters_to_remove
  %w[igsh]
end
1620
end
1721

1822
private

app/models/parser/instagram_profile.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@ def type
1212
# Returns the URL patterns for this parser: the Instagram profile pattern.
def patterns
1313
[INSTAGRAM_PROFILE_URL]
1414
end
15+
16+
# Names of the query parameters to strip from URLs handled by this parser.
def urls_parameters_to_remove
  %w[igsh]
end
1519
end
1620

1721
private

test/models/media_test.rb

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -618,4 +618,22 @@ def teardown
618618
assert_equal "201", response.code
619619
assert_equal 'fake response body', response.body
620620
end
621+
622+
# An Instagram URL whose only query parameter is `igsh` must come out of
# Media.new with that parameter (and the now-empty query string) removed.
test 'should remove parser specific URL parameters' do
  url = 'https://www.instagram.com/p/xyz/?igsh=1'
  WebMock.stub_request(:any, url).to_return(status: 200, body: 'fake response body')

  media = Media.new(url: url)
  assert_not_includes media.url, 'igsh'
  # assert_equal takes (expected, actual); this order keeps failure messages accurate.
  assert_equal 'https://www.instagram.com/p/xyz', media.url
end
630+
631+
# Only `igsh` is removed; the surrounding parameters must survive in their
# original order.
test 'should remove parser specific URL parameters when URL contains multiple parameters' do
  url = 'https://www.instagram.com/p/xyz/?param1=value1&igsh=1&param2=value2'
  WebMock.stub_request(:any, url).to_return(status: 200, body: 'fake response body')

  media = Media.new(url: url)
  assert_not_includes media.url, 'igsh'
  # assert_equal takes (expected, actual); this order keeps failure messages accurate.
  assert_equal 'https://www.instagram.com/p/xyz?param1=value1&param2=value2', media.url
end
621639
end

0 commit comments

Comments
 (0)