[GIS] Converting arcpy based python script to OGR for semi-automated Metadata creation

arcpygdalosgeoqgis

I have created a script to automatically generate base ANZLIC standard metadata using arcpy based coding. Can anyone help me convert it to OGR/GDAL python? This is so that it can be run in QGIS and work with all types of vector and raster data.

enter image description here

The working code with some comments on important sections is below.

import os, xml, arcpy, shutil, datetime
from xml.etree import ElementTree as et

Add/Replace with following

import osgeo, os
from osgeo import ogr

Count=0
DECLARATION = """<?xml version="1.0" encoding="utf-8"?>
<?xml-stylesheet type='text/xsl' href='ANZMeta.xsl'?>\n"""

RootDirectory=arcpy.GetParameterAsText(0)
LogDirectory=arcpy.GetParameterAsText(1)
Metadata_Template=arcpy.GetParameterAsText(2)
ANZMeta=arcpy.GetParameterAsText(3)

#RootDirectory=os.getcwd()
#LogDirectory=os.getcwd()

currentPath=RootDirectory
arcpy.env.workspace = RootDirectory
root = Tk()
InputStrings = StringVar()
Entry(root, textvariable=InputStrings).pack()

Generated_XMLs=LogDirectory+'\GeneratedXML_LOG.txt'
f = open(Generated_XMLs, 'a')
f.write("Log of Metadata Creation Process - Update: "+str(datetime.datetime.now())+"\n")
f.close()

for root, dirs, files in os.walk(RootDirectory, topdown=False):
    #print root, dirs
    for directory in dirs:
        try:
            currentPath=os.path.join(root,directory)
        except:
            pass
        os.chdir(currentPath)
        arcpy.env.workspace = currentPath
        print currentPath
#def Create_xml(currentPath):

Here we will need a similar call to list ogr accessible datasets (ogrinfo seems to only list the properties of a file -is there anything similar to arcpy.List…)

        FileList = arcpy.ListFeatureClasses()
        zone="_Zone"

        for File in FileList:
            Count+=1
            FileDesc_obj = arcpy.Describe(File)
            FileNm=FileDesc_obj.file

The code above needs to be a call to osgeo.ogr.Open(File) right?

            check_meta=os.listdir(currentPath)
            existingXML=FileNm[:FileNm.find('.')]
            existingExtension=FileNm[FileNm.find('.'):]
            print "XML: "+existingXML
            #print check_meta
            #if  existingXML+'.xml' in check_meta:
            #newMetaFile='new'
            for f in check_meta:
                if f.startswith(existingXML) and f.endswith('.xml'):
                    print "exists, file name:", f
                    newMetaFile=FileNm+"_2012Metadata.xml"
                    try:
                        shutil.copy2(f, newMetaFile)
                    except:
                        pass
                    break
                else:
                    #print "Does not exist"
                    newMetaFile=FileNm+"_BaseMetadata.xml"

            print "New meta file: "+newMetaFile+ " for: "+File
            if newMetaFile.endswith('_BaseMetadata.xml'):
                    metafile=Metadata_Template
                    shutil.copy2(metafile,newMetaFile)
                    print "copied: "+metafile

            else:
                    shutil.copy2(Metadata_Template, newMetaFile)
                    shutil.copy2(ANZMeta, 'ANZMeta.xsl')


            print "Parsing meta file: "+newMetaFile
            tree=et.parse(newMetaFile)
            print "Processing: "+str(File)


            # Update following info only if no old metadata was found

            # Update all XML files with following data
            for node in tree.findall('.//procstep/srcused'):
                node.text = str(currentPath+"\\"+existingXML+".xml")

            dt=str(datetime.datetime.now())
            for node in tree.findall('.//procstep/date'):
                node.text = str(dt[:10])
            for node in tree.findall('.//procstep/time'):
                node.text = str(dt[11:13]+dt[16:19])
            for node in tree.findall('.//metd/date'):
                node.text = str(dt[:10])

Following needs to be re-written for osgeo…

            for node in tree.findall('.//northbc'):
                node.text = str(FileDesc_obj.extent.YMax)
            for node in tree.findall('.//southbc'):
                node.text = str(FileDesc_obj.extent.YMin)
            for node in tree.findall('.//westbc'):
                node.text = str(FileDesc_obj.extent.XMin)
            for node in tree.findall('.//eastbc'):
                node.text = str(FileDesc_obj.extent.XMax)

            for node in tree.findall('.//nondig/formname'):
                node.text = "File: "+str(os.getcwd()+"\\"+File)

            for node in tree.findall('.//native/digform/formname'):
                node.text = "Type: "+str(FileDesc_obj.featureType)
            for node in tree.findall('.//avlform/nondig/formname'):
                node.text = "Ext: "+str(FileDesc_obj.extension)
            for node in tree.findall('.//avlform/digform/formname'):
                size= str(int(os.path.getsize(currentPath+"\\"+File))/int(1024))+" KB"
                node.text = "Size: "+size
                print size
            for node in tree.findall('.//theme'):
                node.text = str(FileDesc_obj.spatialReference.name +" ; EPSG: "+str(FileDesc_obj.spatialReference.factoryCode))
                print str(FileDesc_obj.spatialReference.name +" ; EPSG: "+str(FileDesc_obj.spatialReference.factoryCode))
            #print node.text
            projection_info=[]
            Zone=FileDesc_obj.spatialReference.name

            if "GCS" in str(FileDesc_obj.spatialReference.name):
                projection_info=[FileDesc_obj.spatialReference.GCSName, FileDesc_obj.spatialReference.angularUnitName, FileDesc_obj.spatialReference.datumName, FileDesc_obj.spatialReference.spheroidName]
                #print "Geographic Coordinate system"
            else:
                projection_info=[FileDesc_obj.spatialReference.datumName, FileDesc_obj.spatialReference.spheroidName, FileDesc_obj.spatialReference.angularUnitName, Zone[Zone.rfind(zone)-3:]]
                #print "Projected Coordinate system"
            x=0
            for node in tree.findall('.//spdom'):
                for node2 in node.findall('.//keyword'):
                    #print node2.text
                    node2.text = str(projection_info[x])
                    #print node2.text
                    x=x+1


            tree.write(newMetaFile)

            # CODE to add declaration back into xml (sometimes required)
##            s = open(currentPath+"\\"+newMetaFile).read()
##            s = s.replace('<anzmeta>', DECLARATION+'<anzmeta>')
##            #s = s.replace('P.O. Box 1616', 'P.O. Box 573')
##            f = open(currentPath+"\\"+newMetaFile, 'w')
##            f.write(s)
##            f.close()

            f = open(Generated_XMLs, 'a')
            f.write(str(Count)+": "+File+"; "+newMetaFile+"; "+currentPath+";"+existingXML+"\n")
            f.close()

I see that there is the following that I can use

dataSource=osgeo.ogr.Open(fn)
layer = dataSource.GetLayer(0)
projection = layer.GetSpatialRef() #Get spatial reference
feature = layer.GetFeature(0)
extent=layer.GetExtent()
numFeatures=layer.GetFeatureCount()

Best Answer

you're going in the right direction.

Unfortunately in OGR and GDAL there are no "List all datasets" functions available. This can be a bit of a pain, but it's easy enough to implement given a couple of provisos:

  • ogr.Open and gdal.Open will try and open any dataset, and will return None if it fails (note, this doesn't throw an exception)
  • Many datasets are actually made up of a number of different files on disk (such as ESRI Shapefiles or MapInfo TAB). OGR can be given any of these files and it will open the dataset (so, not just .shp, .shx and .dbf will all open the one shapefile) - this means we need to ignore them manually.
IGNORED_EXT = set([".mif", ".shx", ".dbf"]) #Map info files and ESRI shape files recognize
                                            #multiple extensions, so we ignore them
for root, dirs, files in os.walk(RootDirectory, topdown=False):
    for f in files:
        try:
            assert os.path.splitext(f)[1] not in IGNORED_EXT, "Ignoring %s" % f
            ds = ogr.Open(os.path.join(root, f))
            if ds is None:
                ds = gdal.Open(os.path.join(root, f))
            assert ds is not None, "%s is not compatible" % f
            print "Loaded %s successfully" % ds.GetName()
        except Exception, e:
            print "\t%s" % e

At this point GDAL and OGR datasets behave differently, e.g. in OGR there are Layers, in GDAL there are Bands. Try using isinstance to look at the feature type and then do something with it. For example:

isinstance(gdal.Open(ds), gdal.Dataset)

Lastly you may have some issues with getting the projection info from your data; while the ogr.Layer.GetSpatialRef() returns a osr.SpatialReference object, with gdal.Dataset.GetProjection() just returns a wkt string. To get the spatial reference object from a GDAL wkt, you can use (from the Spatial Notes blog):

srs = osr.SpatialReference()
srs.ImportFromWkt(ds.GetProjection())

The projection may not have a name, or even an associated EPSG code, but you should be able to find out all the information you need from the Spatial Reference object once you have it.

One last, last note: rather than using the standard xml library, take a look at lxml instead. Particularly the E-Factory example in the tutorial. It should make building the XML a lot easier.