SQL Data Lineage
A common reason to parse SAS is data lineage: understanding which tables and columns flow through PROC SQL. The parser builds a structured SQL AST with table references and query projections — you do not need to parse SQL text yourself.
Enable native SQL parsing (parseNativeSQL = true) so embedded SQL is fully analyzed.
Sample SAS
proc sql;
create table work.country_states as
select distinct country, state,
avg(latitude) as latitude,
avg(longitude) as longitude
from work.novel_corona_virus_states
where latitude ne . and longitude ne .
group by country, state;
quit;
Add Dependencies
- Kotlin
- Java
repositories {
mavenLocal()
mavenCentral()
flatDir {
dirs("deps")
}
}
dependencies {
implementation(files("deps/sas-parser-with-dependencies-1.6.5-all.jar"))
}
repositories {
mavenLocal()
mavenCentral()
flatDir {
dirs("deps")
}
}
dependencies {
implementation(files("deps/sas-parser-with-dependencies-1.6.5-all.jar"))
implementation "com.strumenta.kolasu:kolasu-javalib:1.5.96"
}
Extract CREATE TABLE columns
Walk SqlProcedure nodes, find CreateTableSqlStatement children, and read the query projections. A projection with an alias (or label) becomes the output column name; a bare ColumnExpression keeps its source column name.
- Kotlin
- Java
import com.strumenta.kolasu.traversing.walkDescendants
import com.strumenta.sas.ast.SourceFile
import com.strumenta.sas.ast.sql.ColumnExpression
import com.strumenta.sas.ast.sql.CreateTableSqlStatement
import com.strumenta.sas.ast.sql.SqlProcedure
import com.strumenta.kolasu.commercial.LicenseManager
import com.strumenta.sas.parser.SASLanguage
import java.io.File
fun formatTableRef(ref: com.strumenta.sas.ast.sql.TableRef?): String {
if (ref == null) return "?"
return if (ref.schema != null) "${ref.schema!!.schema}.${ref.table}" else ref.table
}
fun extractCreateTableLineage(sasFile: File, license: File) {
LicenseManager.registerLicense(license)
val sas = SASLanguage()
sas.parseNativeSQL = true
val root = sas.parse(sasFile).root ?: return
root.walkDescendants(SqlProcedure::class).forEach { proc ->
proc.sqlStatements.forEach { stmt ->
if (stmt is CreateTableSqlStatement) {
println("Table: ${formatTableRef(stmt.table)}")
stmt.query?.projections?.forEach { projection ->
when {
projection.alias != null ->
println(" Label: ${projection.alias!!.name}")
projection.label != null ->
println(" Label: ${projection.label}")
projection.expression is ColumnExpression ->
println(" Name: ${(projection.expression as ColumnExpression).column?.column}")
else ->
println(" Expression: ${projection.expression?.simpleNodeType}")
}
}
}
}
}
}
import com.strumenta.kolasu.javalib.Traversing;
import com.strumenta.sas.ast.SourceFile;
import com.strumenta.sas.ast.sql.ColumnExpression;
import com.strumenta.sas.ast.sql.CreateTableSqlStatement;
import com.strumenta.sas.ast.sql.Projection;
import com.strumenta.sas.ast.sql.SqlProcedure;
import com.strumenta.sas.ast.sql.SqlStatement;
import com.strumenta.kolasu.commercial.LicenseManager;
import com.strumenta.sas.parser.SASLanguage;
import java.io.File;
public class SqlLineage {
public static String formatTableRef(com.strumenta.sas.ast.sql.TableRef ref) {
if (ref == null) {
return "?";
}
if (ref.getSchema() != null) {
return ref.getSchema().getSchema() + "." + ref.getTable();
}
return ref.getTable();
}
public static void extractCreateTableLineage(File sasFile, File license) {
LicenseManager.INSTANCE.registerLicense(license);
SASLanguage sas = new SASLanguage();
sas.setParseNativeSQL(true);
SourceFile root = sas.parse(sasFile).getRoot();
if (root == null) {
return;
}
Traversing.walkDescendantsBreadthFirst(root, SqlProcedure.class, proc -> {
for (SqlStatement stmt : proc.getSqlStatements()) {
if (stmt instanceof CreateTableSqlStatement create) {
String table = formatTableRef(create.getTable());
System.out.println("Table: " + table);
if (create.getQuery() != null) {
for (Projection projection : create.getQuery().getProjections()) {
if (projection.getAlias() != null) {
System.out.println(" Label: " + projection.getAlias().getName());
} else if (projection.getLabel() != null) {
System.out.println(" Label: " + projection.getLabel());
} else if (projection.getExpression() instanceof ColumnExpression col) {
System.out.println(" Name: " + col.getColumn().getColumn());
} else if (projection.getExpression() != null) {
System.out.println(" Expression: "
+ projection.getExpression().getSimpleNodeType());
}
}
}
}
}
});
}
}
For the sample above, typical output looks like:
Table: work.country_states
Name: country
Name: state
Label: latitude
Label: longitude
The exact table qualifier depends on how the parser represents the TableRef in the CREATE TABLE statement.
Collect source tables from queries
Query nodes expose relations — each relation points at a TableRef. Walk queries inside any SQL statement to list tables read by the program.
- Kotlin
- Java
import com.strumenta.kolasu.traversing.walkDescendants
import com.strumenta.sas.ast.SourceFile
import com.strumenta.sas.ast.sql.Query
import com.strumenta.sas.ast.sql.TableRelation
fun collectSourceTables(root: SourceFile): Set<String> {
val tables = mutableSetOf<String>()
root.walkDescendants(Query::class).forEach { query ->
query.relations.forEach { relation ->
if (relation is TableRelation && relation.table != null) {
val ref = relation.table!!
val name = if (ref.schema != null) "${ref.schema!!.schema}.${ref.table}" else ref.table
tables.add(name)
}
}
}
return tables
}
import com.strumenta.kolasu.javalib.Traversing;
import com.strumenta.sas.ast.SourceFile;
import com.strumenta.sas.ast.sql.Query;
import com.strumenta.sas.ast.sql.Relation;
import com.strumenta.sas.ast.sql.TableRelation;
import java.util.HashSet;
import java.util.Set;
public class SourceTables {
public static Set<String> collectSourceTables(SourceFile root) {
Set<String> tables = new HashSet<>();
Traversing.walkDescendantsBreadthFirst(root, Query.class, query -> {
for (Relation relation : query.getRelations()) {
if (relation instanceof TableRelation tr && tr.getTable() != null) {
var ref = tr.getTable();
String name = ref.getSchema() != null
? ref.getSchema().getSchema() + "." + ref.getTable()
: ref.getTable();
tables.add(name);
}
}
});
return tables;
}
}
Combine CREATE TABLE output columns with source tables to build end-to-end lineage graphs. Pair this recipe with DATA step I/O for programs that mix SQL and DATA steps.