Profiling scripts are associated with extensible connections and are used to perform data profiling of source systems. There are a number of environment variables and token replacements available to developers when writing profiling scripts. All profiling scripts must return information about the data profiling result in a specific JSON format.
The following script environment variables are available to all profiling scripts.
| Variable | Description |
|---|---|
| WSL_SRCCFG_<configured_field_name> | The values of configured connection properties can be accessed using environment variables. |
WSL_SCRIPT_<Script_Name>_CODE | The full path to the script specified. The script should be stored in the WhereScape 3D metadata and will be automatically copied to the working directory before the workflow script is executed. |
| WSL_CON_JSON | The full path to a file containing the output from the most recent browse script execution. |
| WSL_BINDIR | The WhereScape 3D installation path. (The default location is 'C:\Program Files\WhereScape\WhereScape 3D'.) |
| WSL_WORKDIR | The working directory where the script is executed. (The working directory will be in %UserProfile%\WhereScape\3D\scriptExecution.) |
| WSL_TASK_NAME | A name used to identify the profilig task. |
| WSL_SEQUENCE | A sequence id to identify each profiling process. |
| WSL_WRKFL_Repo | The repository name associated with the object selected in the repositories pane. |
| WSL_WRKFL_Category | The category name associated with the object selected in the repositories pane. |
| WSL_WRKFL_Model | The model name associated with the object selected in the repositories pane. |
| WSL_WRKFL_Version | The version name associated with the object selected in the repositories pane. |
| WSL_WRKFL_Group | The group name associated with the object selected in the repositories pane. |
| WSL_WRKFL_Table | The table name associated with the object selected in the repositories pane. |
| WSL_WRKFL_Column | The column name associated with the object selected in the repositories pane. |
Each profiling script should return a set of JSON containing the results of the data profiling process. An example of the output required is displayed below.
{
"table_metrics": [
{
"name": "Rowcount",
"description": "The number of rows in the table.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"value": "100"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"value": "15"
}
]
},
{
"name": "Space used in MB",
"description": "The amount of space used by the table in MB.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"value": "10"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"value": "2"
}
]
},
{
"name": "Average row length",
"description": "The average length per row in the table.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"value": "10"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"value": "6"
}
]
}
],
"column_metrics": [
{
"name": "Inferred data type",
"description": "The WhereScape 3D data type inferred from the values stored in this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"value": "INTEGER"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"value": "VARCHAR(50)"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "RegionID",
"value": "INTEGER"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionID",
"value": "INTEGER"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"value": "VARCHAR(50)"
}
]
},
{
"name": "Distinct value count",
"description": "The number of unique values (excluding NULL) in this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"value": "53"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"value": "52"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "RegionID",
"value": "4"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionID",
"value": "4"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"value": "4"
}
]
},
{
"name": "Is unique",
"description": "Is the data in this attribute unique for every row.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"value": "True"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"value": "False"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "RegionID",
"value": "False"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionID",
"value": "True"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"value": "True"
}
]
},
{
"name": "Null count",
"description": "The number of null values in this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"value": "0"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"value": "0"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "RegionID",
"value": "0"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionID",
"value": "0"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"value": "0"
}
]
},
{
"name": "Null %",
"description": "The percentage of null values in this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"value": "0.0"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"value": "0.0"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "RegionID",
"value": "0.0"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionID",
"value": "0.0"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"value": "0.0"
}
]
},
{
"name": "Populated count",
"description": "The number of non-null values in this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"value": "53"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"value": "53"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "RegionID",
"value": "53"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionID",
"value": "4"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"value": "4"
}
]
},
{
"name": "Populated %",
"description": "The percentage of non-null values in this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"value": "100.0"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"value": "100.0"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "RegionID",
"value": "100.0"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionID",
"value": "100.0"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"value": "100.0"
}
]
},
{
"name": "Min value",
"description": "The minimum value in this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"value": "01581"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"value": "Atlanta"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "RegionID",
"value": "1"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionID",
"value": "1"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"value": "Eastern"
}
]
},
{
"name": "Max value",
"description": "The maximum value in this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"value": "98104"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"value": "Wilton"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "RegionID",
"value": "4"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionID",
"value": "4"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"value": "Western"
}
]
},
{
"name": "Min character count",
"description": "The minimum number of characters in at least one value of this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"value": "5"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"value": "4"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"value": "7"
}
]
},
{
"name": "Max character count",
"description": "The maximum number of characters in at least one value of this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"value": "5"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"value": "16"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"value": "8"
}
]
},
{
"name": "Avg character count",
"description": "The average number of characters in at least one value of this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"value": "5"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"value": "9"
}
]
},
{
"name": "Max trimmed length",
"description": "The maximum trimmed length in at least one value of this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"value": "5"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"value": "16"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"value": "8"
}
]
},
{
"name": "Sum value",
"description": "The sum of all the values in this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "RegionID",
"value": "114"
}
]
},
{
"name": "Avg value",
"description": "The average of all the values in this column.",
"type": "single_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "RegionID",
"value": "2.15"
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionID",
"value": "2.5"
}
]
},
{
"name": "Top 10 most frequent values",
"description": "The top 10 most frequent values in this column.",
"type": "pair_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"key": [
"01581",
"01730",
"01833",
"02116",
"02139",
"02184",
"02903",
"03049",
"03801",
"06897"
],
"value": [
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1"
]
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"key": [
"Atlanta",
"Austin",
"Beachwood",
"Bedford",
"Bellevue",
"Bentonville",
"Bloomfield Hills",
"Boston",
"Braintree",
"Cambridge"
],
"value": [
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1"
]
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "RegionID",
"key": [
""
],
"value": ""
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionID",
"key": [
"1",
"2",
"3",
"4"
],
"value": [
"1",
"1",
"1",
"1"
]
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"key": [
"Eastern",
"Northern",
"Sothern",
"Western"
],
"value": [
"1",
"1",
"1",
"1"
]
}
]
},
{
"name": "Top 10 least frequent values",
"description": "The top 10 least frequent values in this column.",
"type": "pair_value",
"metric_values": [
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "Territory ID",
"key": [
"01581",
"01730",
"01833",
"02116",
"02139",
"02184",
"02903",
"03049",
"03801",
"06897"
],
"value": [
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1"
]
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "TerritoryDescription",
"key": [
"New York",
"Neward",
"Orlando",
"Philadelphia",
"Phonix",
"Portsmouth",
"Providence",
"Racine",
"Redmond",
"Rockville"
],
"value": [
"2",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1"
]
},
{
"schema": "D:\\Northwind\\csv",
"table": "Territories",
"column": "RegionID",
"key": [
""
],
"value": ""
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionID",
"key": [
"1",
"2",
"3",
"4"
],
"value": [
"1",
"1",
"1",
"1"
]
},
{
"schema": "D:\\Northwind\\csv",
"table": "Region",
"column": "RegionDescription",
"key": [
"Eastern",
"Northern",
"Sothern",
"Western"
],
"value": [
"1",
"1",
"1",
"1"
]
}
]
}
],
"relationship_metrics": [
{
"name": "In use %",
"description": "Percentage of keys used by the referencing table.",
"type": "single_value",
"metric_values": [
{
"pk_schema": "D:\\Northwind\\csv",
"pk_table": "Region",
"pk_column": "RegionID",
"fk_schema": "D:\\Northwind\\csv",
"fk_table": "Territories",
"fk_column": "RegionID",
"value": "100.0"
}
]
},
{
"name": "Referentially invalid values %",
"description": "Percentage of values in the referencing foreign key field not in the referenced field.",
"type": "single_value",
"metric_values": [
{
"pk_schema": "D:\\Northwind\\csv",
"pk_table": "Region",
"pk_column": "RegionID",
"fk_schema": "D:\\Northwind\\csv",
"fk_table": "Territories",
"fk_column": "RegionID",
"value": "0.0"
}
]
},
{
"name": "Top 10 used PKs",
"description": "The top 10 most frequent used PK values.",
"type": "pair_value",
"metric_values": [
{
"pk_schema": "D:\\Northwind\\csv",
"pk_table": "Region",
"pk_column": "RegionID",
"fk_schema": "D:\\Northwind\\csv",
"fk_table": "Territories",
"fk_column": "RegionID",
"key": [
"1",
"2",
"3",
"4"
],
"value": [
"19",
"15",
"11",
"8"
]
}
]
}
]
} |