Skip to content

Commit 19bcc63

Browse files
Merge pull request #15 from Kanaries/feat_subspace_new_alg
Feat: subspace new algorithm
2 parents 32d11f4 + 4cdaac3 commit 19bcc63

File tree

12 files changed

+203
-57
lines changed

12 files changed

+203
-57
lines changed

packages/frontend/src/workers/cluster.worker.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* eslint no-restricted-globals: 0 */
22
import { kruskalMST } from 'visual-insights';
3-
3+
const PearsonThreshold = 0.5;
44
function sum (arr) {
55
let ans = 0;
66
let len = arr.length;
@@ -15,7 +15,7 @@ const cluster = (e) => {
1515
const { spaces, maxGroupNumber } = e.data;
1616
let result = [];
1717
for (let space of spaces) {
18-
const { edgesInMST, groups } = kruskalMST(space.matrix, maxGroupNumber);
18+
const { edgesInMST, groups } = kruskalMST(space.matrix, maxGroupNumber, PearsonThreshold);
1919
let measureGroups = new Map();
2020
for (let i = 0; i < groups.length; i++) {
2121
if (!measureGroups.has(groups[i])) {

packages/visual-insights/src/dashboard/index.ts

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
import { FieldsFeature, correlation, linearMapPositive } from "../insights/impurity";
22
import { DataSource, OperatorType } from "../commonTypes";
3-
import cluster, { kruskalMST } from "../insights/cluster";
3+
import cluster from "../insights/cluster";
44
import aggregate from 'cube-core';
55
import { normalize, entropy } from "../impurityMeasure";
66
import { crammersV } from './utils';
7+
import { CrammersVThreshold, PearsonCorrelation } from '../insights/config';
78

89
interface DashBoardSpace {
910
dimensions: string[];
@@ -48,7 +49,8 @@ export function getDashBoardSubspace (dataSource: DataSource, dimensions: string
4849
const measureGroups = cluster({
4950
matrix: correlationMatrix,
5051
measures,
51-
groupMaxSize: Math.round(measures.length / 6) // todo: make a config: max 6 measures in a dashboard
52+
groupMaxSize: Math.round(measures.length / 6), // todo: make a config: max 6 measures in a dashboard
53+
threshold: PearsonCorrelation.weak
5254
})
5355

5456
const dimCorrelationMatrix = dimensions.map(d => dimensions.map(d => 0));
@@ -119,7 +121,8 @@ export function getDashBoardView (dashBoardSpace: DashBoardSpace, dataSource: Da
119121
const measureGroups = cluster({
120122
matrix: dashBoardSpace.correlationMatrix,
121123
measures: measures,
122-
groupMaxSize: Math.round(measures.length / 3) // todo: make a config: max 3 measures in a chart
124+
groupMaxSize: Math.round(measures.length / 3), // todo: make a config: max 3 measures in a chart
125+
threshold: PearsonCorrelation.strong
123126
});
124127
for (let group of measureGroups) {
125128
const meaIndexList = group.map(mea => measures.indexOf(mea))
@@ -145,7 +148,8 @@ export function getDashBoardView (dashBoardSpace: DashBoardSpace, dataSource: Da
145148
matrix: dimensionCorrelationMatrix,
146149
measures: dimensions,
147150
groupMaxSize: 2, // todo: make a config: max 2 dimensions in a chart
148-
limitSize: true
151+
limitSize: true,
152+
threshold: CrammersVThreshold
149153
})
150154

151155
const dimGroupEntropyMatrix = getEntropyMatrix(dimensionGroups, measures, dataSource);
@@ -173,4 +177,6 @@ function minIndex(arr: number[]) {
173177
}
174178
}
175179
return pos;
176-
}
180+
}
181+
182+
export { crammersV }
Lines changed: 96 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,109 @@
11
import { DataSource } from "../commonTypes";
2-
3-
export function chiSquared(matrix: number[][] = [[]]): number {
4-
let rowSums = matrix.map(m => 0);
5-
let colSums = matrix[0].map(m => 0);
2+
type nestTree = Map<string, Map<string, number>>;
3+
export function chiSquared(nestTree: nestTree, xSet: Set<string>, ySet: Set<string>): number {
4+
if (typeof nestTree === 'undefined' || typeof xSet === 'undefined' || typeof ySet === 'undefined') {
5+
return 0;
6+
}
7+
let rowSums = new Map<string, number>();
8+
let colSums = new Map<string, number>();
69
let totalSum = 0;
7-
for (let i = 0; i < matrix.length; i++) {
8-
for (let j = 0; j < matrix[i].length; j++) {
9-
rowSums[i] += matrix[i][j];
10-
colSums[j] += matrix[i][j];
11-
totalSum += matrix[i][j];
10+
for (let x of xSet) {
11+
rowSums.set(x, 0);
12+
}
13+
for (let y of ySet) {
14+
colSums.set(y, 0);
15+
}
16+
for (let [x, node] of nestTree) {
17+
for (let [y, counter] of node) {
18+
rowSums.set(x, rowSums.get(x) + counter);
19+
colSums.set(y, colSums.get(y) + counter);
20+
totalSum += counter;
1221
}
1322
}
23+
1424
let chis = 0;
15-
for (let i = 0; i < matrix.length; i++) {
16-
for (let j = 0; j < matrix[i].length; j++) {
17-
let observed = matrix[i][j];
18-
let expected = rowSums[i] * colSums[j] / totalSum;
25+
for (let [x, node] of nestTree) {
26+
for (let [y, observed] of node) {
27+
let expected = rowSums.get(x) * colSums.get(y) / totalSum;
1928
chis += (observed - expected) ** 2 / expected;
2029
}
2130
}
2231
return chis;
2332
}
2433

25-
export function crammersV(dataSource: DataSource, fieldX: string, fieldY: string) {
26-
const xSet = new Set(dataSource.map(d => d[fieldX]))
27-
const ySet = new Set(dataSource.map(d => d[fieldY]))
28-
const xMembers = [...xSet];
29-
const yMembers = [...ySet];
30-
let xDict = {};
31-
let yDict = {};
32-
for (let i = 0; i < xMembers.length; i++) {
33-
xDict[xMembers[i]] = i;
34-
}
35-
for (let i = 0; i < yMembers.length; i++) {
36-
yDict[yMembers[i]] = i;
37-
}
38-
let matrix: number[][] = xMembers.map(x => yMembers.map(y => 0));
39-
for (let record of dataSource) {
40-
matrix[xDict[record[fieldX]]][yDict[record[fieldY]]]++;
34+
35+
export function crammersV(dataSource: DataSource, fieldX: string, fieldY: string): number {
36+
const xSet = new Set<string>()
37+
const ySet = new Set<string>()
38+
const nestTree = new Map<string, Map<string, number>>();
39+
let len = dataSource.length;
40+
for (let i = 0; i < len; i++) {
41+
let record = dataSource[i];
42+
xSet.add(record[fieldX])
43+
ySet.add(record[fieldY]);
44+
if (!nestTree.has(record[fieldX])) {
45+
nestTree.set(record[fieldX], new Map());
46+
}
47+
let node = nestTree.get(record[fieldX]);
48+
if (!node.has(record[fieldY])) {
49+
node.set(record[fieldY], 0);
50+
}
51+
node.set(record[fieldY], node.get(record[fieldY]) + 1);
4152
}
42-
const chis = chiSquared(matrix);
43-
const V = Math.sqrt(chis / (dataSource.length * Math.min(xMembers.length - 1, yMembers.length - 1)))
53+
const chis = chiSquared(nestTree, xSet, ySet);
54+
const V = Math.sqrt(chis / (dataSource.length * Math.min(xSet.size - 1, ySet.size - 1)))
4455
return V;
45-
}
56+
}
57+
58+
59+
// Previous matrix-based implementation, kept for reference; can be used for testing.
60+
// export function crammersV(dataSource: DataSource, fieldX: string, fieldY: string): number {
61+
// const xSet = new Set(dataSource.map(d => d[fieldX]))
62+
// const ySet = new Set(dataSource.map(d => d[fieldY]))
63+
// const xMembers = [...xSet];
64+
// const yMembers = [...ySet];
65+
// let xDict = {};
66+
// let yDict = {};
67+
// for (let i = 0; i < xMembers.length; i++) {
68+
// xDict[xMembers[i]] = i;
69+
// }
70+
// for (let i = 0; i < yMembers.length; i++) {
71+
// yDict[yMembers[i]] = i;
72+
// }
73+
// // let matrix: number[][] = xMembers.map(x => yMembers.map(y => 0));
74+
// let matrix: number[][] = [];
75+
// for (let i = 0; i < xMembers.length; i++) {
76+
// matrix.push([]);
77+
// for (let j = 0; j < yMembers.length; j++) {
78+
// matrix[i].push(0);
79+
// }
80+
// }
81+
// for (let record of dataSource) {
82+
// matrix[xDict[record[fieldX]]][yDict[record[fieldY]]]++;
83+
// }
84+
// const chis = chiSquared(matrix);
85+
// const V = Math.sqrt(chis / (dataSource.length * Math.min(xMembers.length - 1, yMembers.length - 1)))
86+
// return V;
87+
// }
88+
89+
// export function chiSquared(matrix: number[][] = [[]]): number {
90+
// let rowSums = matrix.map(m => 0);
91+
// let colSums = matrix[0].map(m => 0);
92+
// let totalSum = 0;
93+
// for (let i = 0; i < matrix.length; i++) {
94+
// for (let j = 0; j < matrix[i].length; j++) {
95+
// rowSums[i] += matrix[i][j];
96+
// colSums[j] += matrix[i][j];
97+
// totalSum += matrix[i][j];
98+
// }
99+
// }
100+
// let chis = 0;
101+
// for (let i = 0; i < matrix.length; i++) {
102+
// for (let j = 0; j < matrix[i].length; j++) {
103+
// let observed = matrix[i][j];
104+
// let expected = rowSums[i] * colSums[j] / totalSum;
105+
// chis += (observed - expected) ** 2 / expected;
106+
// }
107+
// }
108+
// return chis;
109+
// }

packages/visual-insights/src/index.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ import * as Distribution from './distribution';
77

88
import * as ImpurityMeasure from './impurityMeasure';
99

10-
import getInsightViews, { analysisDimensions, getCombination, clusterMeasures, kruskalMST } from './insights/index';
10+
import getInsightViews, { analysisDimensions, getCombination, clusterMeasures, kruskalMST, getDimSetsBasedOnClusterGroups } from './insights/index';
1111
import * as Cleaner from './cleaner/index';
1212

1313
import * as UnivariateSummary from './univariateSummary/index'
@@ -29,6 +29,7 @@ export {
2929
Cleaner,
3030
getInsightViews,
3131
getCombination,
32+
getDimSetsBasedOnClusterGroups,
3233
clusterMeasures,
3334
kruskalMST
3435
}

packages/visual-insights/src/insights/cluster.ts

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ function unionWithEffect (parents: number[], sizes: number[], n1: number, n2: nu
5555
* @param matrix adjacency matrix
5656
* @param groupNumber number of groups generated by clustering
5757
*/
58-
function kruskal(matrix: number[][], groupNumber: number): Map<number, number[]> {
58+
function kruskal(matrix: number[][], groupNumber: number, threshold: number | undefined = 0): Map<number, number[]> {
5959
const edges = turnAdjMatrix2List(matrix);
6060
edges.sort((a, b) => b[1] - a[1]);
6161
const parents = matrix.map((m, i) => i);
@@ -68,7 +68,7 @@ function kruskal(matrix: number[][], groupNumber: number): Map<number, number[]>
6868
parents[i] = find(parents, i)
6969
}
7070
let set = new Set(parents);
71-
if (set.size <= groupNumber){
71+
if (set.size <= groupNumber || edge[1] < threshold){
7272
break;
7373
}
7474
}
@@ -127,7 +127,6 @@ export function kruskalMSTWithLimitSize(matrix: number[][], limitSize: number =
127127
const parents = matrix.map((m, i) => i);
128128
const cloneParents = matrix.map((m, i) => i);
129129
const sizes = matrix.map(() => 1);
130-
let inCutEdge = false;
131130
for (let edge of edges) {
132131
if (findWithEffect(parents, sizes, edge[0][0]) !== findWithEffect(parents, sizes, edge[0][1])) {
133132
if (sizes[edge[0][0]] + sizes[edge[0][1]] > limitSize) {
@@ -159,7 +158,7 @@ export function kruskalMSTWithLimitSize(matrix: number[][], limitSize: number =
159158
* @param matrix
160159
* @param groupNumber number of groups generated by clustering
161160
*/
162-
export function kruskalMST(matrix: number[][], groupNumber: number = 4) {
161+
export function kruskalMST(matrix: number[][], groupNumber: number = 4, threshold: number | undefined = 0) {
163162
const edges = turnAdjMatrix2List(matrix);
164163
edges.sort((a, b) => b[1] - a[1]);
165164

@@ -180,7 +179,7 @@ export function kruskalMST(matrix: number[][], groupNumber: number = 4) {
180179
let set = new Set(parents);
181180
// TODO:
182181
// + use kruskalMST instead of kruskal.
183-
if (set.size <= groupNumber) {
182+
if (set.size <= groupNumber || edge[1] < threshold) {
184183
inCutEdge = true;
185184
} else {
186185
groups = [...parents]
@@ -199,15 +198,16 @@ interface ClusterProps {
199198
method?: string;
200199
groupMaxSize?: number;
201200
limitSize?: boolean;
201+
threshold?: number;
202202
}
203203

204-
function cluster ({ matrix, measures ,method = 'kruskal', groupMaxSize = 4, limitSize = false }: ClusterProps): string[][] {
204+
function cluster ({ matrix, measures, method = 'kruskal', groupMaxSize = 4, limitSize = false, threshold = 0 }: ClusterProps): string[][] {
205205
// const groups = kruskal({ matrix, groupMaxSize });
206206
let groups;
207207
if (limitSize) {
208208
groups = kruskalWithLimitSize(matrix, groupMaxSize)
209209
} else {
210-
groups = kruskal(matrix, groupMaxSize)
210+
groups = kruskal(matrix, groupMaxSize, threshold)
211211
}
212212

213213
let ans: string[][] = [];

packages/visual-insights/src/insights/config.ts

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,11 @@ export const Depth = 4;
44

55
export const VisualLimit = 8;
66

7-
export const TopKPercentField = 0.8;
7+
export const TopKPercentField = 0.8;
8+
9+
export const CrammersVThreshold = 0.3;
10+
11+
export const PearsonCorrelation = {
12+
strong: 0.5,
13+
weak: 0.3
14+
};

packages/visual-insights/src/insights/impurity.ts

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22
import aggregate from 'cube-core';
33
import { entropy, normalize } from '../impurityMeasure';
44
import { DataSource, OperatorType } from '../commonTypes';
5+
import { crammersV } from '../dashboard/utils';
6+
import { CrammersVThreshold } from './config';
7+
import cluster from './cluster';
58
// insights like outlier and trend both require high impurity of the dimension.
69
const maxVisualChannel = 8;
710
function getCombination(elements: string[], start: number = 1, end: number = elements.length): string[][] {
@@ -22,6 +25,36 @@ function getCombination(elements: string[], start: number = 1, end: number = ele
2225
}
2326
return ans
2427
}
28+
function getDimCorrelationMatrix(dataSource: DataSource, dimensions: string[]): number[][] {
29+
let matrix: number[][] = dimensions.map(d => dimensions.map(d => 0));
30+
for (let i = 0; i < dimensions.length; i++) {
31+
matrix[i][i] = 1;
32+
for(let j = i + 1; j < dimensions.length; j++) {
33+
matrix[i][j] = matrix[j][i] = crammersV(dataSource, dimensions[i], dimensions[j]);
34+
}
35+
}
36+
return matrix;
37+
}
38+
39+
export function getDimSetsBasedOnClusterGroups(dataSource: DataSource, dimensions: string[]): string[][] {
40+
const maxDimNumberInView = 4;
41+
let dimSets: string[][] = [];
42+
let dimCorrelationMatrix = getDimCorrelationMatrix(dataSource, dimensions);
43+
console.log(dimCorrelationMatrix)
44+
// groupMaxSize here means group number.
45+
let groups: string[][] = cluster({
46+
matrix: dimCorrelationMatrix,
47+
measures: dimensions,
48+
groupMaxSize: Math.round(dimensions.length / maxDimNumberInView),
49+
threshold: CrammersVThreshold
50+
});
51+
// todo: maybe a threshold would be better?
52+
for (let group of groups) {
53+
let combineDimSet: string[][] = getCombination(group);
54+
dimSets.push(...combineDimSet);
55+
}
56+
return dimSets;
57+
}
2558

2659
export function linearMapPositive (arr: number[]): number[] {
2760
let min = Math.min(...arr);
@@ -48,7 +81,7 @@ export function correlation(dataSource: DataSource, fieldX: string, fieldY: stri
4881
export type FieldsFeature = [string[], any, number[][]];
4982
function analysisDimensions(dataSource: DataSource, dimensions: string[], measures: string[], operator: OperatorType | undefined = 'sum'): FieldsFeature[] {
5083
let impurityList: FieldsFeature[] = [];
51-
let dimSet = getCombination(dimensions)
84+
let dimSet = getDimSetsBasedOnClusterGroups(dataSource, dimensions);
5285
for (let dset of dimSet) {
5386
let impurity = {};
5487
let aggData = aggregate({

packages/visual-insights/src/insights/index.ts

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
import { analysisDimensions, getCombination } from './impurity';
2-
import { TopKSingleField, TopKPercentField, Depth, VisualLimit } from './config';
1+
import { analysisDimensions, getCombination, getDimSetsBasedOnClusterGroups } from './impurity';
2+
import { TopKPercentField } from './config';
33
import { entropy, normalize } from '../impurityMeasure';
44
import { memberCount } from '../utils'
55
import cluster, { kruskalMST } from './cluster';
@@ -52,4 +52,10 @@ function getInsightViews(dataSource: DataSource, originDimensions: string[], mea
5252
}
5353

5454
export default getInsightViews;
55-
export { analysisDimensions, getCombination, cluster as clusterMeasures, kruskalMST }
55+
export {
56+
analysisDimensions,
57+
getCombination,
58+
getDimSetsBasedOnClusterGroups,
59+
cluster as clusterMeasures,
60+
kruskalMST
61+
};

packages/visual-insights/src/specification.ts

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
11
import { DataSource, FieldType, Field, FieldImpurity, Specification, View } from './commonTypes';
2-
import fieldsAnalysis from './fieldAnalysis';
32
import {
43
// isFieldCategory,
54
// isFieldContinous,
65
memberCount
76
} from './utils';
8-
import { FieldSummary } from './univariateSummary';
97
interface VisualElements {
108
position: number;
119
color: number;

0 commit comments

Comments
 (0)