Skip to content

Commit 2322c48

Browse files
committed
demo setup
0 parents  commit 2322c48

25 files changed

+12009
-0
lines changed

.env.example

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
DB_ON=true
2+
IS_PROD=true
3+
IS_SAVE_IMAGES=false
4+
IS_SAVE_HTML=false
5+
DC_PROXY_USERNAME=
6+
DC_PROXY_PASSWORD=
7+
MIN_UPDATE_TO_STOP=-1
8+
SUPPORTED_TYPES=homeAppliances,truck,trailer
9+
MAX_CONCURRENCY=1
10+
11+
SERVER_NAME=some
12+
ENV=LOCAL

.gitignore

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
node_modules/
2+
data/
3+
.*
4+
!/.gitignore
5+
!/.dockerignore
6+
package-lock.json
7+
yarn-error.log
8+
output/
9+
eval.json
10+
token.json
11+
!.github
12+
!.env.example
13+
matching
14+
misc/

package.json

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
{
2+
"name": "scraper",
3+
"version": "1.0.0",
4+
"description": "",
5+
"main": "index.js",
6+
"scripts": {
7+
"start": "nodemon src/main.ts --watch false --exec ts-node",
8+
"http-worker": "nodemon src/workers.ts --watch false --exec ts-node --type httpWorker",
9+
"headless-worker": "nodemon src/workers.ts --watch false --exec ts-node --type headless",
10+
"store-images": "nodemon src/workers.ts --watch false --exec ts-node --type imageStorer",
11+
"page-worker": "nodemon src/workers.ts --watch false --exec ts-node --type page",
12+
"mitm-worker": "nodemon src/workers.ts --watch false --exec ts-node --type mitmWorker",
13+
"event-consumer": "nodemon src/workers.ts --watch false --exec ts-node --type eventConsumer",
14+
"imsim-worker": "nodemon src/workers.ts --watch false --exec ts-node --type imsimWorker",
15+
"valuation-worker": "nodemon src/workers.ts --watch false --exec ts-node --type valuationWorker",
16+
"bull-board": "nodemon src/services/bull-board.ts --watch false --exec ts-node",
17+
"rc": "nodemon src/rc/rc.ts --watch false --inspect --exec ts-node",
18+
"server": "nodemon src/re-eval-tool/server.ts --watch false --inspect --exec ts-node",
19+
"manual-match-server": "nodemon src/manual-match-tool/server.ts --watch false --inspect --exec ts-node",
20+
"numai-server": "nodemon src/re-eval-tool/server.ts --watch false --inspect --exec ts-node --client numai",
21+
"cars-eval": "nodemon src/car-eval-tool/server.ts --watch false --inspect --exec ts-node",
22+
"debug": "nodemon src/testing.ts --watch false --inspect --exec ts-node --activity testing",
23+
"debug-matching": "nodemon src/product-matching/testing.ts --watch false --inspect --exec ts-node",
24+
"backup": "nodemon src/services/backup.ts --watch false --exec ts-node",
25+
"docker-prune": "nodemon src/services/docker-prune.ts --watch false --exec ts-node",
26+
"docker-restart": "nodemon src/services/docker-restart.ts --watch false --exec ts-node",
27+
"cleanup-bucket": "nodemon src/scripts/cleanup.ts --watch false --exec ts-node",
28+
"predictions": "nodemon src/ml/perform-prediction.ts --watch false --exec ts-node",
29+
"new-dash": "nodemon src/new-dashboard.ts --watch false --inspect --exec ts-node",
30+
"clear:babel-cache": "rimraf -rf ./node_modules/.cache/babel-loader/*",
31+
"test": "jest --verbose --forceExit",
32+
"report-test": "nodemon src/reports/aggregator-pitch-report/index.ts --watch false --inspect --exec ts-node"
33+
},
34+
"jest": {
35+
"setupFiles": [
36+
"./src/tests/setup.ts"
37+
]
38+
},
39+
"author": "",
40+
"license": "ISC",
41+
"devDependencies": {
42+
"@babel/plugin-proposal-decorators": "^7.18.10",
43+
"@babel/preset-env": "^7.18.10",
44+
"@babel/preset-typescript": "^7.18.6",
45+
"@types/country-list": "^2.1.0",
46+
"@types/express": "^4.17.13",
47+
"@types/ioredis": "^4.28.10",
48+
"@types/jest": "^27.4.0",
49+
"@types/lodash": "^4.14.184",
50+
"@types/node": "^17.0.8",
51+
"@types/numeral": "^2.0.2",
52+
"@types/ssh2": "^1.11.5",
53+
"jest": "^27.5.1"
54+
},
55+
"dependencies": {
56+
"@bull-board/api": "^5.0.0",
57+
"@bull-board/express": "^5.0.0",
58+
"@bull-monitor/express": "^5.1.0",
59+
"@elastic/enterprise-search": "^8.1.0-beta.1",
60+
"@extra/proxy-router": "^3.1.6",
61+
"@google-cloud/storage": "^5.8.4",
62+
"@google-cloud/vision": "^2.4.2",
63+
"@influxdata/influxdb-client": "^1.21.0",
64+
"@influxdata/influxdb-client-apis": "^1.21.0",
65+
"@sentry/node": "^7.44.2",
66+
"@slack/socket-mode": "^1.3.1",
67+
"@slack/web-api": "^6.7.2",
68+
"axios": "^0.26.0",
69+
"axios-retry": "^3.3.1",
70+
"bullmq": "^3.10.1",
71+
"cheerio": "^1.0.0-rc.12",
72+
"colorette": "^2.0.19",
73+
"connect-ensure-login": "^0.1.1",
74+
"country-list": "^2.1.1",
75+
"country-regex": "^1.1.0",
76+
"cron": "^2.0.0",
77+
"cron-parser": "^4.2.1",
78+
"csvtojson": "^2.0.10",
79+
"dd-trace": "^3.5.0",
80+
"dotenv": "^16.0.0",
81+
"express-session": "^1.17.3",
82+
"file-type": "16.5.3",
83+
"filereader": "^0.10.3",
84+
"form-data": "^4.0.0",
85+
"fuzzy": "^0.1.3",
86+
"geojson-geometries-lookup": "^0.5.0",
87+
"googleapis": "^95.0.0",
88+
"got-scraping": "^3.2.8",
89+
"he": "^1.2.0",
90+
"html2plaintext": "^2.0.1",
91+
"https-proxy-agent": "^5.0.0",
92+
"image-size": "^1.0.2",
93+
"is-equal": "^1.6.3",
94+
"is-image-url": "^1.1.8",
95+
"jimp": "^0.9.8",
96+
"lodash": "^4.17.21",
97+
"mailersend": "^2.0.5",
98+
"moment": "^2.22.2",
99+
"moment-timezone": "^0.5.34",
100+
"mysql2": "^2.3.3",
101+
"mysqldump": "^3.2.0",
102+
"node-fetch": "^2.6.1",
103+
"node-xlsx": "^0.17.1",
104+
"nodemon": "^2.0.19",
105+
"numeral": "^2.0.6",
106+
"objects-to-csv": "^1.3.6",
107+
"passport": "^0.6.0",
108+
"passport-local": "^1.0.0",
109+
"pino": "^8.1.0",
110+
"pino-pretty": "^8.1.0",
111+
"pixelmatch": "^5.3.0",
112+
"puppeteer": "^19.7.4",
113+
"puppeteer-cluster": "^0.23.0",
114+
"puppeteer-extra": "^3.3.6",
115+
"puppeteer-extra-plugin-adblocker": "^2.13.6",
116+
"puppeteer-extra-plugin-stealth": "^2.11.2",
117+
"query-string": "^6.13.7",
118+
"request": "^2.88.0",
119+
"request-promise": "^4.2.2",
120+
"seedrandom": "^3.0.5",
121+
"sharp": "^0.31.1",
122+
"ssh-exec": "^2.0.0",
123+
"ssh2": "^1.10.0",
124+
"stream": "^0.0.2",
125+
"tough-cookie": "^4.0.0",
126+
"ts-node": "^10.9.1",
127+
"typescript": "^4.8.2",
128+
"unleash-client": "^3.16.1",
129+
"uuid": "^8.3.2",
130+
"xml-js": "^1.6.11",
131+
"yargs": "^17.0.1"
132+
}
133+
}

src/common/aggregator-common.ts

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import { checkColumnsIfModified } from ".";
2+
import { formatPrice } from "../numerals";
3+
import { AggregatorItem, AggregatorSourceItem } from "../types/items/aggregatorItem";
4+
5+
/** Aggregator columns handed to `checkColumnsIfModified` when comparing a processed item with its DB row. */
const columnsNeedToCheckForAggregator: (keyof AggregatorItem)[] = [
    "brand",
    "model",
    "declaredUpdated",
    "declaredProductCode",
]

/**
 * Compares a freshly processed aggregator item with the stored one.
 *
 * @param processedItem item produced by the current run
 * @param dbItem item currently stored in the database
 * @returns the entries reported by `checkColumnsIfModified` for the watched columns,
 *          joined with "," (an empty string when it reports nothing)
 * @throws a plain string (not an `Error`) when `processedItem.countryCode` is set
 *         and differs from `dbItem.countryCode`
 */
function isChangedAggregator(processedItem: AggregatorItem, dbItem: AggregatorItem): string {
    // The column comparison runs first, exactly as before, so the helper is always invoked.
    const changedColumns: string[] = checkColumnsIfModified(
        columnsNeedToCheckForAggregator,
        processedItem,
        dbItem
    )

    const incomingCountry = processedItem.countryCode
    if (!incomingCountry || incomingCountry === dbItem.countryCode) {
        return changedColumns.join(",")
    }

    // Both items are expected to describe the same country; a mismatch is treated as fatal.
    throw `isChangedAggregator got different country codes! ID=${dbItem.id}: ${dbItem.countryCode} vs ${processedItem.countryCode}`
}
23+
24+
25+
/** Aggregator-source columns handed to `checkColumnsIfModified` when comparing a processed item with its DB row. */
const columnsNeedToCheckForAggregatorSource: (keyof AggregatorSourceItem)[] = [
    "price",
    "inStock",
    "priceType",
    "title",
    "url",
]

/**
 * Compares a freshly processed aggregator source item with the stored one.
 *
 * @param processedItem item produced by the current run
 * @param dbItem item currently stored in the database
 * @returns the entries reported by `checkColumnsIfModified` for the watched columns,
 *          joined with "," (an empty string when it reports nothing)
 * @throws a plain string (not an `Error`) when `processedItem.countryCode` is set
 *         and differs from `dbItem.countryCode`
 */
function isChangedAggregatorSource(processedItem: AggregatorSourceItem, dbItem: AggregatorSourceItem): string {
    // The column comparison runs first, exactly as before, so the helper is always invoked.
    const changedColumns: string[] = checkColumnsIfModified(
        columnsNeedToCheckForAggregatorSource,
        processedItem,
        dbItem
    )

    const incomingCountry = processedItem.countryCode
    if (!incomingCountry || incomingCountry === dbItem.countryCode) {
        return changedColumns.join(",")
    }

    // Both items are expected to describe the same country; a mismatch is treated as fatal.
    throw `isChangedAggregatorSource got different country codes! ID=${dbItem.id}: ${dbItem.countryCode} vs ${processedItem.countryCode}`
}
42+
43+
export {
44+
isChangedAggregator,
45+
isChangedAggregatorSource
46+
}

src/common/brands.ts

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
import { Job } from "bullmq"
2+
import { countryCodes, dbServers, EngineType } from "../config/enums"
3+
import { ContextType } from "../libs/logger"
4+
import { jsonOrStringForDb, jsonOrStringToJson, stringOrNullForDb, stringToHash } from "../utils"
5+
import _ from "lodash"
6+
import { sources } from "../sites/sources"
7+
import items from "./../../pharmacyItems.json"
8+
import connections from "./../../brandConnections.json"
9+
10+
/** Lookup from a brand name to the list of brand names associated with it. */
type BrandsMapping = Record<string, string[]>
13+
14+
/**
 * Builds a lookup from every known brand name to all brand names connected to it.
 *
 * Each row of `brandConnections.json` links `manufacturer_p1` with every
 * ";"-separated entry of `manufacturers_p2`. Links are symmetric and followed
 * transitively, so every brand maps to all brands reachable from it (which
 * includes the brand itself as soon as it has at least one connection).
 *
 * Brand names are trimmed and lower-cased on both sides; blank names are
 * ignored so that a stray ";" cannot glue unrelated brand groups together.
 *
 * @returns an object keyed by normalised brand name
 */
export async function getBrandsMapping(): Promise<BrandsMapping> {
    // The connections were originally read from the database with the query below.
    // For the test day the result set was exported to brandConnections.json instead.
    // const query = `
    //     SELECT
    //         LOWER(p1.manufacturer) manufacturer_p1
    //         , LOWER(GROUP_CONCAT(DISTINCT p2.manufacturer ORDER BY p2.manufacturer SEPARATOR ';')) AS manufacturers_p2
    //     FROM
    //         property_matchingvalidation v
    //     INNER JOIN
    //         property_pharmacy p1 ON v.m_source = p1.source
    //         AND v.m_source_id = p1.source_id
    //         AND v.m_country_code = p1.country_code
    //         AND p1.newest = true
    //     INNER JOIN
    //         property_pharmacy p2 ON v.c_source = p2.source
    //         AND v.c_source_id = p2.source_id
    //         AND v.c_country_code = p2.country_code
    //         AND p2.newest = true
    //     WHERE
    //         v.m_source = 'AZT'
    //         AND v.engine_type = '${EngineType.Barcode}'
    //         and p1.manufacturer is not null
    //         and p2.manufacturer is not null
    //         and p1.manufacturer not in ('kita', 'nera', 'cits')
    //         and p2.manufacturer not in ('kita', 'nera', 'cits')
    //     GROUP BY
    //         p1.manufacturer
    // `
    // const brandConnections = await executeQueryAndGetResponse(dbServers.pharmacy, query)
    const brandConnections = connections

    // Same normalisation for both columns so "Bayer " and "bayer" end up as one node.
    const normalizeBrand = (raw: string): string => raw.trim().toLowerCase()

    // Collects every brand reachable from `brand` by following the links in `map`.
    const getRelatedBrands = (map: Map<string, Set<string>>, brand: string): Set<string> => {
        const relatedBrands = new Set<string>()
        const pending = [brand]
        while (pending.length > 0) {
            const current = pending.pop()
            if (current === undefined) {
                break
            }
            const neighbours = map.get(current)
            if (!neighbours) {
                continue
            }
            for (const neighbour of neighbours) {
                if (!relatedBrands.has(neighbour)) {
                    relatedBrands.add(neighbour)
                    pending.push(neighbour)
                }
            }
        }
        return relatedBrands
    }

    // Undirected graph of direct brand-to-brand links.
    const brandMap = new Map<string, Set<string>>()
    const neighboursOf = (brand: string): Set<string> => {
        let neighbours = brandMap.get(brand)
        if (!neighbours) {
            neighbours = new Set<string>()
            brandMap.set(brand, neighbours)
        }
        return neighbours
    }

    for (const { manufacturer_p1, manufacturers_p2 } of brandConnections) {
        const brand1 = normalizeBrand(manufacturer_p1)
        if (!brand1) {
            continue
        }
        const brand1Neighbours = neighboursOf(brand1)
        for (const rawBrand2 of manufacturers_p2.split(";")) {
            const brand2 = normalizeBrand(rawBrand2)
            if (!brand2) {
                // Empty token (e.g. trailing ";"): not a brand.
                continue
            }
            brand1Neighbours.add(brand2)
            neighboursOf(brand2).add(brand1)
        }
    }

    // Flatten: every brand gets the full list of brands reachable from it.
    const flatMapObject: Record<string, string[]> = {}
    for (const brand of brandMap.keys()) {
        flatMapObject[brand] = Array.from(getRelatedBrands(brandMap, brand))
    }

    return flatMapObject
}
99+
100+
/**
 * Supplies the pharmacy products that brand assignment should look at.
 *
 * In this version every argument is ignored and the contents of
 * `pharmacyItems.json` are returned unchanged; the database query that the
 * arguments were meant for is kept below, commented out, for reference.
 *
 * @param countryCode unused here (filter value in the commented-out query)
 * @param source unused here (filter value in the commented-out query)
 * @param versionKey unused here (meta key checked in the commented-out filter)
 * @param mustExist unused here (mapping-row requirement in the commented-out filter)
 * @returns the imported `items` array
 */
async function getPharmacyItems(countryCode: countryCodes, source: sources, versionKey: string, mustExist = true) {
    // let query = `
    //     SELECT
    //         p.url, p.removed_timestamp, p.title, p.source_id
    //         , p.manufacturer
    //         , map.source_id m_id
    //         , map.source
    //         , map.country_code
    //         , map.meta
    //     FROM
    //         property_pharmacy p
    //     left join pharmacy_mapping map on p.source_id = map.source_id and p.source = map.source and p.country_code = map.country_code
    //     WHERE
    //         p.newest = TRUE
    //         and p.country_code = '${countryCode}'
    //         and p.source = '${source}'
    //         and p.removed_timestamp is null
    //         and (p.manufacturer is null or p.manufacturer in ('nera', 'kita', 'cits'))
    //     ORDER BY p.removed_timestamp IS NULL DESC, p.removed_timestamp DESC
    // `
    // let products = await executeQueryAndGetResponse(dbServers.pharmacy, query)
    // for (let product of products) {
    //     product.meta = jsonOrStringToJson(product.meta)
    // }

    // let finalProducts = products.filter((product) => (!mustExist || product.m_id) && !product.meta[versionKey])

    // The static JSON export stands in for the query result.
    return items
}
130+
131+
/**
 * Tells whether `brand` occurs in `input` as a stand-alone term rather than as
 * part of a longer word. Matching is case-insensitive.
 *
 * Two checks are combined:
 *  - whitespace delimited: the brand is preceded by the start of the string or
 *    whitespace and followed by whitespace or the end of the string. This one
 *    is needed for brands that start or end with a character `\b` does not
 *    treat as a word character (accented letters, punctuation), e.g. "nestlé".
 *  - word boundary: the brand sits between `\b` boundaries, which also accepts
 *    punctuation as a delimiter, e.g. "aspirin (bayer)".
 *
 * @param input text to search in, e.g. a product title
 * @param brand brand name to look for; a blank brand never matches
 * @returns true when the brand is found as a separate term
 */
export function checkBrandIsSeparateTerm(input: string, brand: string): boolean {
    // A blank brand would otherwise "match" nearly every input.
    if (brand.trim() === "") {
        return false
    }

    // Escape any special characters in the brand name for use in a regular expression
    const escapedBrand = brand.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")

    // Brand delimited by whitespace and/or the string edges (beginning, middle, end or whole input)
    const whitespaceDelimited = new RegExp(`(?:^|\\s)${escapedBrand}(?:\\s|$)`, "i").test(input)

    // Brand delimited by word boundaries
    const separateTerm = new RegExp(`\\b${escapedBrand}\\b`, "i").test(input)

    return whitespaceDelimited || separateTerm
}
147+
148+
/**
 * Tries to recognise a known brand in the title of every pharmacy product that
 * has no mapping row yet.
 *
 * For each such product all known brand names are tested against the title
 * with `checkBrandIsSeparateTerm`; the matches are logged to the console and
 * the first match is taken as the product's brand. Writing the result to the
 * product mapping table is not part of this version of the function.
 *
 * @param countryCode country passed on to `getPharmacyItems` and used in the product key
 * @param source source passed on to `getPharmacyItems` and used in the product key
 * @param job optional job handle; not used by the current implementation
 */
export async function assignBrandIfKnown(countryCode: countryCodes, source: sources, job?: Job) {
    const brandsMapping = await getBrandsMapping()

    const versionKey = "assignBrandIfKnown"
    const products = await getPharmacyItems(countryCode, source, versionKey, false)

    // Every brand appears in the list of each brand it is related to, so flatten
    // and de-duplicate once up front instead of re-testing it for every product.
    // First-occurrence order is kept because the first match becomes the brand.
    const candidateBrands: string[] = []
    const seenBrands = new Set<string>()
    for (const brandKey of Object.keys(brandsMapping)) {
        for (const brand of brandsMapping[brandKey] ?? []) {
            if (!seenBrands.has(brand)) {
                seenBrands.add(brand)
                candidateBrands.push(brand)
            }
        }
    }

    for (const product of products) {
        if (product.m_id) {
            // Already exists in the mapping table, probably no need to update
            continue
        }

        const matchedBrands: string[] = candidateBrands.filter((candidate) =>
            checkBrandIsSeparateTerm(product.title, candidate)
        )
        console.log(`${product.title} -> ${matchedBrands.join(",")}`)

        // Values prepared for the mapping-table row.
        const sourceId = product.source_id
        const meta = { matchedBrands }
        const brand = matchedBrands.length ? matchedBrands[0] : null

        const key = `${source}_${countryCode}_${sourceId}`
        const uuid = stringToHash(key)

        // Then brand is inserted into product mapping table
    }
}

0 commit comments

Comments
 (0)