Profiling scripts are associated with extensible connections and are used to perform data profiling of source systems. There are a number of environment variables and token replacements available to developers when writing profiling scripts. All profiling scripts must return information about the data profiling result in a specific JSON format.
Environment variables in scripts
The following script environment variables are available to all profiling scripts.
| Variable | Description |
|---|---|
| WSL_SRCCFG_<configured_field_name> | The values of configured connection properties can be accessed using environment variables. |
| WSL_CON_JSON | The full path to a file containing the output from the most recent browse script execution. |
| WSL_BINDIR | The WhereScape 3D installation path. (The default location is 'C:\Program Files\WhereScape\WhereScape 3D'.) |
| WSL_WORKDIR | The working directory where the script is executed. (The working directory will be in %UserProfile%\WhereScape\3D\scriptExecution.) |
| WSL_TASK_NAME | A name used to identify the workflow task. |
| WSL_SEQUENCE | A sequence id to identify each workflow process. |
| WSL_WRKFL_Repo | The repository name associated with the object selected in the repositories pane. |
| WSL_WRKFL_Category | The category name associated with the object selected in the repositories pane. |
| WSL_WRKFL_Model | The model name associated with the object selected in the repositories pane. |
| WSL_WRKFL_Version | The version name associated with the object selected in the repositories pane. |
| WSL_WRKFL_Group | The group name associated with the object selected in the repositories pane. |
| WSL_WRKFL_Table | The table name associated with the object selected in the repositories pane. If no table object is selected then the value should be an empty string. |
| WSL_WRKFL_Column | The column name associated with the object selected in the repositories pane. If no table object is selected then the value should be an empty string. |
Profiling script output
Each profiling script should return a set of JSON containing the results of the data profiling process. An example of the output required is displayed below.
{
"table_metrics": [
{
"name": "Rowcount",
"description": "The number of rows in the table.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"value": "100"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"value": "15"
}
]
},
{
"name": "Space used in MB",
"description": "The amount of space used by the table in MB.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"value": "10"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"value": "2"
}
]
},
{
"name": "Average row length",
"description": "The average length per row in the table.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"value": "10"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"value": "6"
}
]
}
],
"column_metrics": [
{
"name": "Inferred data type",
"description": "The WhereScape 3D data type inferred from the values stored in this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"value": "INTEGER"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"value": "VARCHAR(50)"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "RegionID",
"value": "INTEGER"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionID",
"value": "INTEGER"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"value": "VARCHAR(50)"
}
]
},
{
"name": "Distinct value count",
"description": "The number of unique values (excluding NULL) in this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"value": "53"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"value": "52"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "RegionID",
"value": "4"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionID",
"value": "4"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"value": "4"
}
]
},
{
"name": "Is unique",
"description": "Is the data in this attribute unique for every row.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"value": "True"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"value": "False"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "RegionID",
"value": "False"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionID",
"value": "True"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"value": "True"
}
]
},
{
"name": "Null count",
"description": "The number of null values in this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"value": "0"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"value": "0"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "RegionID",
"value": "0"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionID",
"value": "0"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"value": "0"
}
]
},
{
"name": "Null %",
"description": "The percentage of null values in this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"value": "0.0"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"value": "0.0"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "RegionID",
"value": "0.0"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionID",
"value": "0.0"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"value": "0.0"
}
]
},
{
"name": "Populated count",
"description": "The number of non-null values in this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"value": "53"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"value": "53"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "RegionID",
"value": "53"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionID",
"value": "4"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"value": "4"
}
]
},
{
"name": "Populated %",
"description": "The percentage of non-null values in this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"value": "100.0"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"value": "100.0"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "RegionID",
"value": "100.0"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionID",
"value": "100.0"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"value": "100.0"
}
]
},
{
"name": "Min value",
"description": "The minimum value in this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"value": "01581"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"value": "Atlanta"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "RegionID",
"value": "1"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionID",
"value": "1"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"value": "Eastern"
}
]
},
{
"name": "Max value",
"description": "The maximum value in this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"value": "98104"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"value": "Wilton"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "RegionID",
"value": "4"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionID",
"value": "4"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"value": "Western"
}
]
},
{
"name": "Min character count",
"description": "The minimum number of characters in at least one value of this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"value": "5"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"value": "4"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"value": "7"
}
]
},
{
"name": "Max character count",
"description": "The maximum number of characters in at least one value of this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"value": "5"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"value": "16"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"value": "8"
}
]
},
{
"name": "Avg character count",
"description": "The average number of characters in at least one value of this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"value": "5"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"value": "9"
}
]
},
{
"name": "Max trimmed length",
"description": "The maximum trimmed length in at least one value of this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"value": "5"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"value": "16"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"value": "8"
}
]
},
{
"name": "Sum value",
"description": "The sum of all the values in this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "RegionID",
"value": "114"
}
]
},
{
"name": "Avg value",
"description": "The average of all the values in this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "RegionID",
"value": "2.15"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionID",
"value": "2.5"
}
]
},
{
"name": "Top 10 most frequent values",
"description": "The top 10 most frequent values in this column.",
"type": "pair_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"key": [
"01581",
"01730",
"01833",
"02116",
"02139",
"02184",
"02903",
"03049",
"03801",
"06897"
],
"value": [
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1"
]
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"key": [
"Atlanta",
"Austin",
"Beachwood",
"Bedford",
"Bellevue",
"Bentonville",
"Bloomfield Hills",
"Boston",
"Braintree",
"Cambridge"
],
"value": [
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1"
]
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "RegionID",
"key": [
""
],
"value": ""
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionID",
"key": [
"1",
"2",
"3",
"4"
],
"value": [
"1",
"1",
"1",
"1"
]
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"key": [
"Eastern",
"Northern",
"Sothern",
"Western"
],
"value": [
"1",
"1",
"1",
"1"
]
}
]
},
{
"name": "Top 10 least frequent values",
"description": "The top 10 least frequent values in this column.",
"type": "pair_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"key": [
"01581",
"01730",
"01833",
"02116",
"02139",
"02184",
"02903",
"03049",
"03801",
"06897"
],
"value": [
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1"
]
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"key": [
"New York",
"Neward",
"Orlando",
"Philadelphia",
"Phonix",
"Portsmouth",
"Providence",
"Racine",
"Redmond",
"Rockville"
],
"value": [
"2",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1"
]
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "RegionID",
"key": [
""
],
"value": ""
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionID",
"key": [
"1",
"2",
"3",
"4"
],
"value": [
"1",
"1",
"1",
"1"
]
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"key": [
"Eastern",
"Northern",
"Sothern",
"Western"
],
"value": [
"1",
"1",
"1",
"1"
]
}
]
}
],
"relationship_metrics": [
{
"name": "In use %",
"description": "Percentage of keys used by the referencing table.",
"type": "single_value",
"metric_values": [
{
"pk_schema": "D:\\Northwind\\csv",
"pk_table": "Region",
"pk_column": "RegionID",
"fk_schema": "D:\\Northwind\\csv",
"fk_table": "Territories",
"fk_column": "RegionID",
"value": "100.0"
}
]
},
{
"name": "Referentially invalid values %",
"description": "Percentage of values in the referencing foreign key field not in the referenced field.",
"type": "single_value",
"metric_values": [
{
"pk_schema": "D:\\Northwind\\csv",
"pk_table": "Region",
"pk_column": "RegionID",
"fk_schema": "D:\\Northwind\\csv",
"fk_table": "Territories",
"fk_column": "RegionID",
"value": "0.0"
}
]
},
{
"name": "Top 10 used PKs",
"description": "The top 10 most frequent used PK values.",
"type": "pair_value",
"metric_values": [
{
"pk_schema": "D:\\Northwind\\csv",
"pk_table": "Region",
"pk_column": "RegionID",
"fk_schema": "D:\\Northwind\\csv",
"fk_table": "Territories",
"fk_column": "RegionID",
"key": [
"1",
"2",
"3",
"4"
],
"value": [
"19",
"15",
"11",
"8"
]
}
]
}
]
}