Subject: Marine Geospatial Ecology Tools (MGET) help
Text archives
From: | "Jason Roberts" <> |
---|---|
To: | "'Liza Hoos'" <> |
Cc: | <> |
Subject: | RE: [mget-help] glm, predict glm from table |
Date: | Fri, 21 Sep 2012 18:15:30 -0400 |
Liza, That is a bug in MGET that we also recently encountered. I’m sorry you hit it. Thank you for the debug report with verbose logging. That allows me to very quickly confirm it is the same problem. A patched file is attached. To apply it to your system: 1. Make sure you have MGET 0.8a43 installed. (You do—I can see this in the verbose logging—but others reading this message might not.) 2. Shut down all ArcGIS programs. 3. Save the attached file to one of the following directories, depending on your version of ArcGIS and Python, overwriting the file that is already there: a. Arc 9.2 - C:\Python24\Lib\site-packages\GeoEco\Statistics b. Arc 9.3 - C:\Python25\Lib\site-packages\GeoEco\Statistics c. Arc 10.0 - C:\Python26\ArcGIS10.0\Lib\site-packages\GeoEco\Statistics d. Arc 10.1 - C:\Python27\ArcGIS10.1\Lib\site-packages\GeoEco\Statistics Then try again. Please let me know if it works. Best, Jason From: Liza Hoos [mailto:] Hello, |
#
# Copyright (C) 2007 Jason J. Roberts and Ben D. Best
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License (available in the file LICENSE.TXT)
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
import copy
import glob
import os.path
import random
import sys
import types
from GeoEco.ArcGIS import GeoprocessorManager
from GeoEco.DatabaseAccess.ArcGIS import ArcGIS91DatabaseConnection
from GeoEco.DataManagement.ArcGISRasters import ArcGISRaster
from GeoEco.DataManagement.Directories import TemporaryDirectory
from GeoEco.DataManagement.Files import File
from GeoEco.DynamicDocString import DynamicDocString
from GeoEco.Internationalization import _
from GeoEco.Logging import Logger
from GeoEco.R import R, RPackageDependency
# Public classes
class GLM(object):
    __doc__ = DynamicDocString()

    @classmethod
    def FitToArcGISTable(cls, inputTable, outputModelFile, formula,
                         family=u'gaussian', where=None, link=None,
                         variance=None,
                         xColumnName=None, yColumnName=None,
                         zColumnName=None, mColumnName=None,
                         selectionMethod=None, logSelectionDetails=True,
                         writeSummaryFile=True, writeDiagnosticPlots=True,
                         numDiagLabels=3, diagLabelField=None, writeTermPlots=True, residuals=False,
                         xAxis=True, commonScale=True, plotFileFormat=u'png', res=1000., width=3000.,
                         height=3000., pointSize=10.0, bg=u'white',
                         overwriteExisting=False):
        """Fit a generalized linear model (GLM) to the rows of an ArcGIS table.

        The table is loaded into a temporary R data frame, the fitting is
        done by the R script FitGLMForDataframe.r (which lives next to this
        module), and the fitted model plus any requested summary/plot files
        are then moved from a temporary directory to paths derived from
        outputModelFile. Parameter validation is performed by the
        DynamicDocString metadata attached to this method.
        """
        cls.__doc__.Obj.ValidateMethodInvocation()

        # If the caller requested emf format for the plot files, convert the
        # width and height from thousandths of an inch to inches.
        if plotFileFormat == u'emf':
            width = width / 1000
            height = height / 1000

        # Load the table into a temporary data frame. If the caller wants
        # diagnostic points labeled, also load the labeling field.
        if diagLabelField is not None:
            extraFieldsToLoad = [diagLabelField]
        else:
            extraFieldsToLoad = None

        dataFrameName, xColumnParam, yColumnParam, zColumnParam, mColumnParam, coordinateSystemParam, familyParam = \
            _LoadDataFrameForModelFitting(inputTable, {_(u'formula') : formula},
                                          {_(u'formula') : True}, family, link, variance, None, where, xColumnName,
                                          yColumnName, zColumnName, mColumnName, extraFieldsToLoad)

        # Fit the GLM and create the output files in the temp directory.

        r = R.GetInterpreter()
        try:
            R.EvaluateFile(os.path.join(os.path.dirname(sys.modules[cls.__module__].__file__), 'SummarizeModel.r'), False)
            R.EvaluateFile(os.path.join(os.path.dirname(sys.modules[cls.__module__].__file__), 'FitGLMForDataframe.r'), False)

            tempDir = TemporaryDirectory()
            tempOutputFile = os.path.join(tempDir.Path, u'output')

            # Translate optional Python values into R literals: NULL when
            # omitted, a double-quoted string otherwise.
            if selectionMethod is None:
                selectionMethodParam = u'NULL'
            else:
                selectionMethodParam = u'"' + selectionMethod + u'"'

            if diagLabelField is None:
                diagLabelField = u'NULL'
            else:
                diagLabelField = u'"' + diagLabelField + u'"'

            # FitGLMForDataframe returns the number of term plots it wrote,
            # so we know how many files to move below.
            numTermPlots = r('FitGLMForDataframe(f=%s, d=%s, fam=%s, outputModelFile="%s", xVar=%s, yVar=%s, zVar=%s, mVar=%s, coordinateSystem=%s, selectionMethod=%s, logSelectionDetails=%s, writeSummaryFile=%s, writeDiagnosticPlots=%s, numDiagLabels=%i, diagLabelField=%s, writeTermPlots=%s, partial.resid=%s, xAxis=%s, commonScale=%s, plotFileFormat="%s", res=%f, width=%f, height=%f, pointSize=%f, bg="%s")' % (formula, dataFrameName, familyParam, tempOutputFile.replace('\\', '\\\\'), xColumnParam, yColumnParam, zColumnParam, mColumnParam, coordinateSystemParam, selectionMethodParam, str(logSelectionDetails).upper(), str(writeSummaryFile).upper(), str(writeDiagnosticPlots).upper(), numDiagLabels, diagLabelField, str(writeTermPlots).upper(), str(residuals).upper(), str(xAxis).upper(), str(commonScale).upper(), plotFileFormat, res, width, height, pointSize, bg))

            # Move the temporary output files to the requested location.

            File.MoveSilent(tempOutputFile, outputModelFile, overwriteExisting=overwriteExisting)

            outputFilePrefix = os.path.splitext(outputModelFile)[0]

            if writeSummaryFile:
                File.MoveSilent(os.path.join(tempDir.Path, u'output_summary.txt'), outputFilePrefix + u'_summary.txt', overwriteExisting=overwriteExisting)

            # The diagnostic plots may not exist (R only writes them in some
            # cases), so test for the first one before moving the whole set.
            if writeDiagnosticPlots and os.path.isfile(os.path.join(tempDir.Path, u'output_resid_fit.%s' % plotFileFormat)):
                File.MoveSilent(os.path.join(tempDir.Path, u'output_resid_fit.%s' % plotFileFormat), outputFilePrefix + u'_resid_fit.%s' % plotFileFormat, overwriteExisting=overwriteExisting)
                File.MoveSilent(os.path.join(tempDir.Path, u'output_qq.%s' % plotFileFormat), outputFilePrefix + u'_qq.%s' % plotFileFormat, overwriteExisting=overwriteExisting)
                File.MoveSilent(os.path.join(tempDir.Path, u'output_scale_loc.%s' % plotFileFormat), outputFilePrefix + u'_scale_loc.%s' % plotFileFormat, overwriteExisting=overwriteExisting)
                File.MoveSilent(os.path.join(tempDir.Path, u'output_cooks.%s' % plotFileFormat), outputFilePrefix + u'_cooks.%s' % plotFileFormat, overwriteExisting=overwriteExisting)
                File.MoveSilent(os.path.join(tempDir.Path, u'output_resid_lev.%s' % plotFileFormat), outputFilePrefix + u'_resid_lev.%s' % plotFileFormat, overwriteExisting=overwriteExisting)
                File.MoveSilent(os.path.join(tempDir.Path, u'output_cooks_lev.%s' % plotFileFormat), outputFilePrefix + u'_cooks_lev.%s' % plotFileFormat, overwriteExisting=overwriteExisting)

            if writeTermPlots and numTermPlots > 0:
                for i in range(int(numTermPlots)):
                    File.MoveSilent(os.path.join(tempDir.Path, u'output_term%02i.%s' % (i+1, plotFileFormat)), outputFilePrefix + (u'_term%02i.%s' % (i+1, plotFileFormat)), overwriteExisting=overwriteExisting)

        # Delete the data frame.

        finally:
            r('if (exists("%s")) rm("%s")' % (dataFrameName, dataFrameName))

    @classmethod
    def PredictFromArcGISTable(cls, inputModelFile, inputTable=None,
                               predictedValuesField=None, cutoff=None,
                               where=None, ignoreOutOfRangeValues=True,
                               noDataValue=None,
                               outputPlotFile=None, measure1=u'tpr',
                               measure2=u'fpr', colorize=True,
                               outputSummaryFile=None, res=1000.,
                               width=3000., height=3000., pointSize=10.0, bg=u'white',
                               overwriteExisting=False):
        """Predict GLM response values for the rows of an ArcGIS table.

        Thin wrapper: delegates to the module-level _PredictFromArcGISTable
        helper with the model type 'glm'.
        """
        cls.__doc__.Obj.ValidateMethodInvocation()
        return _PredictFromArcGISTable('glm', _(u'Fit GLM'), inputModelFile,
                                       inputTable, predictedValuesField, cutoff, where, ignoreOutOfRangeValues,
                                       noDataValue, outputPlotFile, measure1, measure2, colorize, outputSummaryFile,
                                       res, width, height, pointSize, bg, overwriteExisting)

    @classmethod
    def PredictFromArcGISRasters(cls, inputModelFile, outputResponseRaster,
                                 templateRaster=None, rasterPredictorNames=None, predictorRasters=None,
                                 constantPredictorNames=None, constantPredictorValues=None, cutoff=None,
                                 resamplingTechniques=None, ignoreOutOfRangeValues=True,
                                 outputErrorRaster=None, buildPyramids=False, overwriteExisting=False):
        """Predict a response raster from predictor rasters using a fitted GLM.

        Thin wrapper: delegates to the module-level _PredictFromArcGISRasters
        helper with the model type 'glm'.
        """
        cls.__doc__.Obj.ValidateMethodInvocation()
        _PredictFromArcGISRasters('glm', _(u'Fit GLM'), inputModelFile,
                                  outputResponseRaster, cutoff, constantPredictorNames,
                                  constantPredictorValues, rasterPredictorNames, predictorRasters,
                                  templateRaster, resamplingTechniques, ignoreOutOfRangeValues,
                                  outputErrorRaster, buildPyramids, overwriteExisting)
class GAM(object):
    __doc__ = DynamicDocString()

    @classmethod
    def FitToArcGISTable(cls, inputTable, outputModelFile, formula,
                         family=u'gaussian', rPackage=u'mgcv',
                         where=None, link=None, variance=None, theta=None,
                         method=u'GCV.Cp', optimizer=u'outer', alternativeOptimizer=u'newton',
                         xColumnName=None, yColumnName=None,
                         zColumnName=None, mColumnName=None,
                         selectionMethod=None, logSelectionDetails=True,
                         writeSummaryFile=True, writeDiagnosticPlots=True,
                         writeTermPlots=True, residuals=False, xAxis=True, commonScale=True,
                         plotFileFormat=u'png', res=1000., width=3000., height=3000., pointSize=10.0,
                         bg=u'white',
                         overwriteExisting=False):
        """Fit a generalized additive model (GAM) to the rows of an ArcGIS table.

        The table is loaded into a temporary R data frame and the fitting is
        done by FitGAMForDataframe.r using either the mgcv or the gam R
        package. The fitted model plus any requested summary/plot files are
        then moved from a temporary directory to paths derived from
        outputModelFile.
        """
        cls.__doc__.Obj.ValidateMethodInvocation()

        # Perform additional parameter validation.

        if rPackage == u'mgcv' and selectionMethod is not None:
            Logger.RaiseException(ValueError(_(u'Automated model selection is not currently supported when the GAM is fitted using the R mgcv package. To perform automated model selection, you must use the gam package. As an alternative, you can use the mgcv package and use splines with "shrinkage". See the documentation for the Automated Model Selection Method parameter for more information. ')))

        if family == u'negbin':
            if rPackage != u'mgcv':
                Logger.RaiseException(ValueError(_(u'The "negbin" (negative binomial) model family is only available when the GAM is fitted using the R mgcv package.')))
            if theta is None:
                Logger.RaiseException(ValueError(_(u'The "negbin" (negative binomial) model family was specified but the theta parameter was omitted. You must specify a value for the theta parameter when using the negbin model family.')))

        # If the caller requested emf format for the plot files, convert the
        # width and height from thousandths of an inch to inches.
        if plotFileFormat == u'emf':
            width = width / 1000
            height = height / 1000

        # Load the R package.

        dependency = RPackageDependency(rPackage)
        dependency.Initialize()

        # Load the table into a temporary data frame.

        dataFrameName, xColumnParam, yColumnParam, zColumnParam, mColumnParam, coordinateSystemParam, familyParam = \
            _LoadDataFrameForModelFitting(inputTable, {_(u'formula') : formula},
                                          {_(u'formula') : True}, family, link, variance, theta, where, xColumnName,
                                          yColumnName, zColumnName, mColumnName)

        # Fit the GAM and create the output files in the temp directory.

        r = R.GetInterpreter()
        try:
            R.EvaluateFile(os.path.join(os.path.dirname(sys.modules[cls.__module__].__file__), 'SummarizeModel.r'), False)
            R.EvaluateFile(os.path.join(os.path.dirname(sys.modules[cls.__module__].__file__), 'FitGAMForDataframe.r'), False)

            tempDir = TemporaryDirectory()
            tempOutputFile = os.path.join(tempDir.Path, u'output')

            # Translate optional Python values into R literals.
            if method is None:
                methodParam = u'NULL'
            else:
                methodParam = u'"' + method + u'"'

            if optimizer is None:
                optimizerParam = u'NULL'
            elif optimizer != u'outer':
                optimizerParam = u'"' + optimizer + u'"'
            else:
                # mgcv's 'outer' optimizer takes a second element naming the
                # alternative optimization method; default it to 'newton'.
                if alternativeOptimizer is None:
                    alternativeOptimizer = u'newton'
                optimizerParam = u'c("' + optimizer + u'", "' + alternativeOptimizer + u'")'

            if selectionMethod is None:
                selectionMethodParam = u'NULL'
            else:
                selectionMethodParam = u'"' + selectionMethod + u'"'

            # Diagnostic plots are only produced for mgcv fits.
            if rPackage != u'mgcv':
                writeDiagnosticPlots = False

            # FitGAMForDataframe returns the number of term plots it wrote.
            numTermPlots = r('FitGAMForDataframe(f=%s, d=%s, fam=%s, rPackage="%s", outputModelFile="%s", method=%s, optimizer=%s, xVar=%s, yVar=%s, zVar=%s, mVar=%s, coordinateSystem=%s, selectionMethod=%s, logSelectionDetails=%s, writeSummaryFile=%s, writeDiagnosticPlots=%s, writeTermPlots=%s, partial.resid=%s, xAxis=%s, commonScale=%s, plotFileFormat="%s", res=%f, width=%f, height=%f, pointSize=%f, bg="%s")' % (formula, dataFrameName, familyParam, rPackage, tempOutputFile.replace('\\', '\\\\'), methodParam, optimizerParam, xColumnParam, yColumnParam, zColumnParam, mColumnParam, coordinateSystemParam, selectionMethodParam, str(logSelectionDetails).upper(), str(writeSummaryFile).upper(), str(writeDiagnosticPlots).upper(), str(writeTermPlots).upper(), str(residuals).upper(), str(xAxis).upper(), str(commonScale).upper(), plotFileFormat, res, width, height, pointSize, bg))

            # Move the temporary output files to the requested location.

            File.MoveSilent(tempOutputFile, outputModelFile, overwriteExisting=overwriteExisting)

            outputFilePrefix = os.path.splitext(outputModelFile)[0]

            if writeSummaryFile:
                File.MoveSilent(os.path.join(tempDir.Path, u'output_summary.txt'), outputFilePrefix + u'_summary.txt', overwriteExisting=overwriteExisting)

            if writeDiagnosticPlots and os.path.isfile(os.path.join(tempDir.Path, u'output_diag.%s' % plotFileFormat)):
                File.MoveSilent(os.path.join(tempDir.Path, u'output_diag.%s' % plotFileFormat), outputFilePrefix + u'_diag.%s' % plotFileFormat, overwriteExisting=overwriteExisting)

            if writeTermPlots and numTermPlots > 0:
                for i in range(int(numTermPlots)):
                    File.MoveSilent(os.path.join(tempDir.Path, u'output_term%02i.%s' % (i+1, plotFileFormat)), outputFilePrefix + (u'_term%02i.%s' % (i+1, plotFileFormat)), overwriteExisting=overwriteExisting)

        # Delete the data frame.

        finally:
            r('if (exists("%s")) rm("%s")' % (dataFrameName, dataFrameName))

    @classmethod
    def PredictFromArcGISTable(cls, inputModelFile, inputTable=None,
                               predictedValuesField=None, cutoff=None,
                               where=None, ignoreOutOfRangeValues=True,
                               noDataValue=None,
                               outputPlotFile=None, measure1=u'tpr',
                               measure2=u'fpr', colorize=True,
                               outputSummaryFile=None, res=1000.,
                               width=3000., height=3000., pointSize=10.0, bg=u'white',
                               overwriteExisting=False):
        """Predict GAM response values for the rows of an ArcGIS table.

        Thin wrapper: delegates to the module-level _PredictFromArcGISTable
        helper with the model type 'gam'.
        """
        cls.__doc__.Obj.ValidateMethodInvocation()
        return _PredictFromArcGISTable('gam', _(u'Fit GAM'), inputModelFile,
                                       inputTable, predictedValuesField, cutoff, where, ignoreOutOfRangeValues,
                                       noDataValue, outputPlotFile, measure1, measure2, colorize, outputSummaryFile,
                                       res, width, height, pointSize, bg, overwriteExisting)

    @classmethod
    def PredictFromArcGISRasters(cls, inputModelFile, outputResponseRaster,
                                 templateRaster=None, rasterPredictorNames=None, predictorRasters=None,
                                 constantPredictorNames=None, constantPredictorValues=None, cutoff=None,
                                 resamplingTechniques=None, ignoreOutOfRangeValues=True,
                                 outputErrorRaster=None, buildPyramids=False, overwriteExisting=False):
        """Predict a response raster from predictor rasters using a fitted GAM.

        Thin wrapper: delegates to the module-level _PredictFromArcGISRasters
        helper with the model type 'gam'.
        """
        cls.__doc__.Obj.ValidateMethodInvocation()
        _PredictFromArcGISRasters('gam', _(u'Fit GAM'), inputModelFile,
                                  outputResponseRaster, cutoff, constantPredictorNames,
                                  constantPredictorValues, rasterPredictorNames, predictorRasters,
                                  templateRaster, resamplingTechniques, ignoreOutOfRangeValues,
                                  outputErrorRaster, buildPyramids, overwriteExisting)

    @classmethod
    def BayesPredictFromArcGISRasters(cls, inputModelFile,
                                      inputPredictorRasters, variableNames, thresholds, outputProbabilityRasters,
                                      templateRaster=None, resamplingTechniques=None, ignoreOutOfRangeValues=True,
                                      samples=1000, chunks=100, buildPyramids=False, overwriteExisting=False):
        """Produce exceedance-probability rasters from a binomial/logit mgcv GAM.

        NOTE: This tool is deliberately disabled; everything after the
        unconditional raise below is unreachable until it is removed.
        """
        cls.__doc__.Obj.ValidateMethodInvocation()

        raise NotImplementedError(_(u'This tool is not fully implemented. Please contact the MGET development team for assistance.'))

        # Perform additional parameter validation.

        inputPredictorRastersLower = map(lambda s: unicode(s).lower(), inputPredictorRasters)
        for i in range(len(outputProbabilityRasters)):
            raster = outputProbabilityRasters[i].lower()
            if raster in inputPredictorRastersLower:
                Logger.RaiseException(ValueError(_(u'The output raster %(out)s also appears in the list of input predictor rasters. This is not allowed. Please specify a different output raster or remove it from the list of input predictor rasters.') % {u'out': outputProbabilityRasters[i]}))

        # Load the fitted model.

        r = R.GetInterpreter()
        gp = GeoprocessorManager.GetWrappedGeoprocessor()
        r('load("%s")' % inputModelFile.replace('\\', '\\\\'))
        try:
            if not r('exists("model")'):
                Logger.RaiseException(ValueError(_('The input model file %(file)s does not contain a variable called "model", indicating that it was not generated by the Fit GAM tool. Please provide a file that was generated by the Fit GAM tool.') % {u'file': inputModelFile}))

            if 'gam' not in r('class(model)'):
                Logger.RaiseException(ValueError(_('The input model file %(file)s contains a variable called "model" but it is not a GAM, indicating that it was not generated by the Fit GAM tool. Please provide a file that was generated by the Fit GAM tool.') % {u'file': inputModelFile}))

            if not r('exists("rPackage")') or r['rPackage'] != 'mgcv':
                Logger.RaiseException(ValueError(_('The input model in file %(file)s was fitted by the R %(pkg)s package, but this tool requires that the model be fitted with the mgcv package. Please refit the model using the mgcv package and try again.') % {u'file': inputModelFile, u'pkg': r['rPackage']}))

            Logger.Info(_(u'Loaded %(type)s model from %(file)s. The model was fitted with the R %(pkg)s package.') % {u'type': r('class(model)[1]'), u'file': inputModelFile, u'pkg': r['rPackage']})

            # Log a summary of the model, to remind the user what it is.

            R.EvaluateFile(os.path.join(os.path.dirname(sys.modules[GLM.__module__].__file__), 'SummarizeModel.r'), False)
            r('writeLines("")')
            r('writeLines("MODEL SUMMARY:")')
            r('writeLines("==============")')
            r('writeLines(SummarizeModel(model))')
            r('writeLines("")')

            if r('model$family$family') != 'binomial' or r('model$family$link') != 'logit':
                Logger.RaiseException(ValueError(_('The input model in the file %(file)s is of the %(fam)s family and uses the %(link)s link function. This tool requires the model to be of the binomial family and use the logit link function. Please refit the model and try again. If this is not appropriate for your modeling problem, please contact the author of this tool for assistance.') % {u'file': inputModelFile, u'pkg': r['rPackage'], u'fam': r('model$family$family'), u'link': r('model$family$link')}))

            # Prepare the input rasters for the prediction.

            tempDir = TemporaryDirectory()
            _PreparePredictorRasters(r, gp, tempDir, variableNames, inputPredictorRasters, templateRaster, resamplingTechniques)

            # Do the prediction. The prediction rasters are returned as
            # ArcInfo ASCII grids because rgdal cannot write rasters (ArcInfo
            # binary grids).
            #
            # Force the logging system to log R Info messages as Debug so
            # that the user is not spammed with "Closing GDAL dataset handle"
            # messages. There appears to be no way to suppress these messages
            # from within R.

            R.EvaluateFile(os.path.join(os.path.dirname(sys.modules[GLM.__module__].__file__), 'BayesPredictGAMForArcGISRasters.r'), False)
            oldLoggingLevel = r.LogInfoAsDebug
            r.LogInfoAsDebug = True
            try:
                if chunks is not None:
                    chunks = str(chunks)
                else:
                    chunks = 'NULL'
                for i in range(len(outputProbabilityRasters)):
                    Logger.Info(_(u'Predicting for threshold %g...') % thresholds[i])
                    r('BayesPredictGAMForArcGISRasters(model, rastersForPredictors, %g, "%s", %s, %i, %s)' % (thresholds[i], os.path.join(tempDir.Path, u'temp_output%i.txt' % i).replace('\\', '\\\\'), str(ignoreOutOfRangeValues).upper(), samples, chunks))
            finally:
                r.LogInfoAsDebug = oldLoggingLevel

            # Convert the ArcInfo ASCII grids to rasters.

            Logger.Info(_(u'Creating outputs...'))
            for i in range(len(outputProbabilityRasters)):
                tempOutputRaster = os.path.join(tempDir.Path, u'output%i' % i)
                gp.ASCIIToRaster_Conversion(os.path.join(tempDir.Path, u'temp_output%i.txt' % i), tempOutputRaster, u'FLOAT')
                gp.DefineProjection_management(tempOutputRaster, coordinateSystem)      # TODO: Fix this -- coordinateSystem is not defined in this Python scope (pre-existing bug in this disabled tool)
                if buildPyramids:
                    gp.BuildPyramids_Management(tempOutputRaster)

            # Copy the output rasters to the requested destinations.

            for i in range(len(outputProbabilityRasters)):
                ArcGISRaster.CopySilent(os.path.join(tempDir.Path, u'output%i' % i), outputProbabilityRasters[i], overwriteExisting=overwriteExisting)

        # Delete R variables assigned by this function.

        finally:
            r('if (exists("rastersForPredictors")) rm("rastersForPredictors")')
            r('if (exists("model")) rm("model")')
            r('if (exists("rPackage")) rm("rPackage")')
            r('if (exists("xVar")) rm("xVar")')
            r('if (exists("yVar")) rm("yVar")')
            r('if (exists("zVar")) rm("zVar")')
            r('if (exists("mVar")) rm("mVar")')
            r('if (exists("coordinateSystem")) rm("coordinateSystem")')
class TreeModel(object):
    __doc__ = DynamicDocString()

    @classmethod
    def FitToArcGISTable(cls, inputTable, outputModelFile, formula, method,
                         where=None, allowMissingCovariates=True,
                         minSplit=20, minBucket=7, cp=0.01, maxCompete=4, maxSurrogate=5,
                         useSurrogate=2, surrogateStyle=0, xval=1000, maxDepth=30, pruningMethod=None,
                         pruningCP=None,
                         xColumnName=None, yColumnName=None,
                         zColumnName=None, mColumnName=None,
                         writeSummaryFile=True, writeDiagnosticPlots=True,
                         writeTreePlot=True, writePrunedTreePlot=True, plotFileFormat=u'png',
                         res=1000., width=3000., height=3000., pointSize=10.0, bg=u'white',
                         treePlotType=0, extra=1, percentage=True,
                         under=True, clipRightLabels=True, fallenLeaves=False, branchType=0,
                         branch=0.2, uniform=True, digits=2, varlen=0, faclen=0, cex=None, tweak=1.,
                         compress=True, ycompress=True,
                         overwriteExisting=False):
        """Fit a classification or regression tree (rpart) to an ArcGIS table.

        The table is loaded into a temporary R data frame and the fitting is
        done by FitTreeModelForDataframe.r; the fitted model and any
        requested summary/plot files are then moved to paths derived from
        outputModelFile. The many tuning parameters mirror R's
        rpart.control and the tree-plotting options.
        """
        cls.__doc__.Obj.ValidateMethodInvocation()

        # Perform additional parameter validation.

        if pruningMethod == u'user specified cp' and pruningCP is None:
            Logger.RaiseException(ValueError(_(u'The Pruning Method is \'User specified\' but no Complexity Parameter for Pruning was provided. Please provide a complexity parameter for pruning or select a different Pruning Method.')))

        # If the caller requested emf format for the plot files, convert the
        # width and height from thousandths of an inch to inches.
        if plotFileFormat == u'emf':
            width = width / 1000
            height = height / 1000

        # Load the table into a temporary data frame. The [:-1] drops the
        # familyParam, which tree models do not need.
        dataFrameName, xColumnParam, yColumnParam, zColumnParam, mColumnParam, coordinateSystemParam = \
            _LoadDataFrameForModelFitting(inputTable, {_(u'formula') : formula},
                                          {_(u'formula') : True}, None, None, None, None, where, xColumnName,
                                          yColumnName, zColumnName, mColumnName)[:-1]

        # Fit the tree model and create the output files in the temp
        # directory.

        r = R.GetInterpreter()
        try:
            R.EvaluateFile(os.path.join(os.path.dirname(sys.modules[cls.__module__].__file__), 'FitTreeModelForDataframe.r'), False)

            tempDir = TemporaryDirectory()
            tempOutputFile = os.path.join(tempDir.Path, u'output')

            # Translate optional Python values into R literals.
            if pruningMethod is None:
                pruningMethod = u'NULL'
            else:
                pruningMethod = u'"' + pruningMethod + u'"'

            if pruningCP is None:
                pruningCP = u'NULL'
            else:
                pruningCP = repr(pruningCP)

            if cex is None:
                cex = u'NULL'
            else:
                cex = repr(cex)

            r('FitTreeModelForDataframe(f=%s, d=%s, outputModelFile="%s", method="%s", allowMissingCovariates=%s, minSplit=%i, minBucket=%i, cp=%f, maxCompete=%i, maxSurrogate=%i, useSurrogate=%i, surrogateStyle=%i, xval=%i, maxDepth=%i, pruningMethod=%s, pruningCP=%s, xVar=%s, yVar=%s, zVar=%s, mVar=%s, coordinateSystem=%s, writeSummaryFile=%s, writeDiagnosticPlots=%s, writeTreePlot=%s, writePrunedTreePlot=%s, plotFileFormat="%s", res=%f, width=%f, height=%f, pointSize=%f, bg="%s", treePlotType=%i, extra=%i, percentage=%s, under=%s, clipRightLabels=%s, fallenLeaves=%s, branchType=%i, branch=%f, uniform=%s, digits=%i, varlen=%i, faclen=%i, cex=%s, tweak=%f, compress=%s, ycompress=%s)' % (formula, dataFrameName, tempOutputFile.replace('\\', '\\\\'), method, str(allowMissingCovariates).upper(), minSplit, minBucket, cp, maxCompete, maxSurrogate, useSurrogate, surrogateStyle, xval, maxDepth, pruningMethod, pruningCP, xColumnParam, yColumnParam, zColumnParam, mColumnParam, coordinateSystemParam, str(writeSummaryFile).upper(), str(writeDiagnosticPlots).upper(), str(writeTreePlot).upper(), str(writePrunedTreePlot).upper(), plotFileFormat, res, width, height, pointSize, bg, treePlotType, extra, str(percentage).upper(), str(under).upper(), str(clipRightLabels).upper(), str(fallenLeaves).upper(), branchType, branch, str(uniform).upper(), digits, varlen, faclen, cex, tweak, str(compress).upper(), str(ycompress).upper()))

            # Move the temporary output files to the requested location.
            # Each plot may or may not exist depending on the model, so test
            # for each file individually.

            File.MoveSilent(tempOutputFile, outputModelFile, overwriteExisting=overwriteExisting)

            outputFilePrefix = os.path.splitext(outputModelFile)[0]

            if writeSummaryFile:
                File.MoveSilent(os.path.join(tempDir.Path, u'output_summary.txt'), outputFilePrefix + u'_summary.txt', overwriteExisting=overwriteExisting)

            if writeDiagnosticPlots:
                if os.path.isfile(os.path.join(tempDir.Path, u'output_cp.' + plotFileFormat)):
                    File.MoveSilent(os.path.join(tempDir.Path, u'output_cp.' + plotFileFormat), outputFilePrefix + u'_cp.' + plotFileFormat, overwriteExisting=overwriteExisting)
                if os.path.isfile(os.path.join(tempDir.Path, u'output_rsquare.' + plotFileFormat)):
                    File.MoveSilent(os.path.join(tempDir.Path, u'output_rsquare.' + plotFileFormat), outputFilePrefix + u'_rsquare.' + plotFileFormat, overwriteExisting=overwriteExisting)
                if os.path.isfile(os.path.join(tempDir.Path, u'output_residuals.' + plotFileFormat)):
                    File.MoveSilent(os.path.join(tempDir.Path, u'output_residuals.' + plotFileFormat), outputFilePrefix + u'_residuals.' + plotFileFormat, overwriteExisting=overwriteExisting)
                if os.path.isfile(os.path.join(tempDir.Path, u'output_pruned_residuals.' + plotFileFormat)):
                    File.MoveSilent(os.path.join(tempDir.Path, u'output_pruned_residuals.' + plotFileFormat), outputFilePrefix + u'_pruned_residuals.' + plotFileFormat, overwriteExisting=overwriteExisting)

            if writeTreePlot and os.path.isfile(os.path.join(tempDir.Path, u'output_unpruned_tree.' + plotFileFormat)):
                File.MoveSilent(os.path.join(tempDir.Path, u'output_unpruned_tree.' + plotFileFormat), outputFilePrefix + u'_unpruned_tree.' + plotFileFormat, overwriteExisting=overwriteExisting)

            if pruningMethod != u'NULL' and writePrunedTreePlot and os.path.isfile(os.path.join(tempDir.Path, u'output_pruned_tree.' + plotFileFormat)):
                File.MoveSilent(os.path.join(tempDir.Path, u'output_pruned_tree.' + plotFileFormat), outputFilePrefix + u'_pruned_tree.' + plotFileFormat, overwriteExisting=overwriteExisting)

        # Delete the data frame.

        finally:
            r('if (exists("%s")) rm("%s")' % (dataFrameName, dataFrameName))

    @classmethod
    def PredictFromArcGISTable(cls, inputModelFile, inputTable=None,
                               predictedValuesField=None, cutoff=None,
                               where=None, ignoreOutOfRangeValues=True,
                               noDataValue=None,
                               outputPlotFile=None, measure1=u'tpr',
                               measure2=u'fpr', colorize=True,
                               outputSummaryFile=None, res=1000.,
                               width=3000., height=3000., pointSize=10.0, bg=u'white',
                               overwriteExisting=False):
        """Predict tree-model response values for the rows of an ArcGIS table.

        Thin wrapper: delegates to the module-level _PredictFromArcGISTable
        helper with the model type 'rpart'.
        """
        cls.__doc__.Obj.ValidateMethodInvocation()
        return _PredictFromArcGISTable('rpart', _(u'Fit Tree Model'),
                                       inputModelFile, inputTable, predictedValuesField, cutoff, where,
                                       ignoreOutOfRangeValues, noDataValue, outputPlotFile, measure1, measure2,
                                       colorize, outputSummaryFile, res, width, height, pointSize, bg,
                                       overwriteExisting)

    @classmethod
    def PredictFromArcGISRasters(cls, inputModelFile, outputResponseRaster,
                                 templateRaster=None, rasterPredictorNames=None, predictorRasters=None,
                                 constantPredictorNames=None, constantPredictorValues=None, cutoff=None,
                                 resamplingTechniques=None, ignoreOutOfRangeValues=True, buildPyramids=False,
                                 overwriteExisting=False):
        """Predict a response raster from predictor rasters using a fitted tree model.

        Thin wrapper: delegates to _PredictFromArcGISRasters with the model
        type 'rpart'. Tree models do not produce an error raster, hence the
        None argument.
        """
        cls.__doc__.Obj.ValidateMethodInvocation()
        _PredictFromArcGISRasters('rpart', _(u'Fit Tree Model'),
                                  inputModelFile, outputResponseRaster, cutoff, constantPredictorNames,
                                  constantPredictorValues, rasterPredictorNames, predictorRasters,
                                  templateRaster, resamplingTechniques, ignoreOutOfRangeValues, None,
                                  buildPyramids, overwriteExisting)
class LinearMixedModel(object):
__doc__ = DynamicDocString()
@classmethod
def FitToArcGISTable(cls, inputTable, outputModelFile, fixedFormula,
randomFormula,
where=None, method=u'REML',
correlationStructure=None, correlationFormula=None, range_=None, nugget=None,
metric=None, fixed=False,
xColumnName=None, yColumnName=None,
zColumnName=None, mColumnName=None,
writeSummaryFile=True, writeDiagnosticPlots=True,
plotFileFormat=u'png', res=1000., width=3000., height=3000., pointSize=10.0,
bg=u'white',
overwriteExisting=False):
cls.__doc__.Obj.ValidateMethodInvocation()
# Perform additional validation.
if correlationStructure is not None and correlationFormula is None or
correlationStructure is None and correlationFormula is not None:
Logger.RaiseException(ValueError(_('To fit a model that includes
for within-group correlation, you must specify both the a correlation
structure and a correlation formula. If you do not wish to include
within-group correlation, you must omit both of those parameters.')))
if correlationStructure is not None and range_ is None and nugget is
not None:
Logger.RaiseException(ValueError(_('If the Nugget parameter is
specified, the Range parameter must also be specified.')))
# If the caller requested emf format for the plot files,
# convert the width and height from thousands of an inch to
# inches.
if plotFileFormat == u'emf':
width = width / 1000
height = height / 1000
# Load the table into a temporary data frame.
dataFrameName, xColumnParam, yColumnParam, zColumnParam,
mColumnParam, coordinateSystemParam, familyParam =
_LoadDataFrameForModelFitting(inputTable, {_(u'fixed effects formula') :
fixedFormula, _(u'random effects formula') : randomFormula, _(u'correlation
formula') : correlationFormula}, {_(u'fixed effects formula') : True,
_(u'random effects formula') : False, _(u'correlation formula') : False},
None, None, None, None, where, xColumnName, yColumnName, zColumnName,
mColumnName, None)
# Fit the model with the nlme package and create the output
# files in the temp directory.
r = R.GetInterpreter()
try:
R.EvaluateFile(os.path.join(os.path.dirname(sys.modules[cls.__module__].__file__),
'SummarizeModel.r'), False)
R.EvaluateFile(os.path.join(os.path.dirname(sys.modules[cls.__module__].__file__),
'FitLMEForDataframe.r'), False)
tempDir = TemporaryDirectory()
tempOutputFile = os.path.join(tempDir.Path, u'output')
if randomFormula is None:
randomFormula = u'NULL'
if correlationStructure is not None:
if correlationStructure == u'exponential':
correlation = u'corExp('
elif correlationStructure == u'gaussian':
correlation = u'corGaus('
elif correlationStructure == u'linear':
correlation = u'corLin('
elif correlationStructure == u'rational quadratic':
correlation = u'corRatio('
elif correlationStructure == u'spherical':
correlation = u'corSpher('
else:
Logger.RaiseException(RuntimeError(_('Unknown
correlationStructure value \'%(cs)s\'. Please contact the author of this tool
for assistance.') % {u'cs': correlationStructure}))
if range_ is not None:
if nugget is None:
correlation += repr(range_) + ', '
else:
correlation += u'c(' + repr(range_) + u', ' +
repr(nugget) + u'), '
correlation += u'form=' + correlationFormula + u', nugget=' +
repr(nugget is not None).upper()
if metric is not None:
correlation += u', metric="' + metric + u'"'
correlation += u', fixed=' + repr(fixed).upper() + u')'
else:
correlation = u'NULL'
r('library(nlme)') # Required here so we can instantiate
correlation classes
r('FitLMEForDataframe(fixed=%s, data=%s, outputModelFile="%s",
random=%s, correlation=%s, method="%s", xVar=%s, yVar=%s, zVar=%s, mVar=%s,
coordinateSystem=%s, writeSummaryFile=%s, writeDiagnosticPlots=%s,
plotFileFormat="%s", res=%f, width=%f, height=%f, pointSize=%f, bg="%s")' %
(fixedFormula, dataFrameName, tempOutputFile.replace('\\',
'\\\\'), randomFormula, correlation, method,
xColumnParam, yColumnParam, zColumnParam, mColumnParam,
coordinateSystemParam,
str(writeSummaryFile).upper(),
str(writeDiagnosticPlots).upper(), plotFileFormat, res, width, height,
pointSize, bg))
# Move the temporary output files to the requested
# location.
File.MoveSilent(tempOutputFile, outputModelFile,
overwriteExisting=overwriteExisting)
outputFilePrefix = os.path.splitext(outputModelFile)[0]
if writeSummaryFile:
File.MoveSilent(os.path.join(tempDir.Path,
u'output_summary.txt'), outputFilePrefix + u'_summary.txt',
overwriteExisting=overwriteExisting)
if writeDiagnosticPlots:
files = glob.glob(os.path.join(tempDir.Path, u'output_*.' +
plotFileFormat))
for f in files:
File.MoveSilent(f, outputFilePrefix + '_' +
os.path.basename(f).split('_', 1)[1], overwriteExisting=overwriteExisting)
# Delete the data frame.
finally:
r('if (exists("%s")) rm("%s")' % (dataFrameName, dataFrameName))
@classmethod
def PredictFromArcGISRasters(cls, inputModelFile, outputResponseRaster,
                             templateRaster=None, rasterPredictorNames=None, predictorRasters=None,
                             constantPredictorNames=None, constantPredictorValues=None, cutoff=None,
                             resamplingTechniques=None, ignoreOutOfRangeValues=True, buildPyramids=False,
                             overwriteExisting=False):
    # Predict the response of a previously-fitted linear mixed model ('lme')
    # across a set of ArcGIS rasters, writing the result to
    # outputResponseRaster. Thin wrapper around the module-level
    # _PredictFromArcGISRasters helper; the _(u'Fit Linear Mixed Model')
    # argument is presumably the display name of the tool that fitted the
    # model, for use in messages — confirm against the helper's definition.
    cls.__doc__.Obj.ValidateMethodInvocation()

    # NOTE: the argument order below does not match this method's parameter
    # order; it must stay exactly as-is to match the helper's signature.
    _PredictFromArcGISRasters('lme', _(u'Fit Linear Mixed Model'), inputModelFile, outputResponseRaster, cutoff, constantPredictorNames, constantPredictorValues, rasterPredictorNames, predictorRasters, templateRaster, resamplingTechniques, ignoreOutOfRangeValues, None, buildPyramids, overwriteExisting)
class RandomForestModel(object):
    # Fits random forest models to ArcGIS tables and predicts from them.
    # Documentation is supplied by the MGET metadata framework via
    # DynamicDocString, so no literal docstrings appear here.
    __doc__ = DynamicDocString()

    @classmethod
    def FitToArcGISTable(cls, inputTable, outputModelFile, formula,
                         ntree=500, mtry=None, rPackage=u'randomForest',
                         where=None, replace=False, cfMaxSurrogate=None,
                         seed=None, importance=True, useScaledImportance=False,
                         useConditionalImportance=False,
                         xColumnName=None, yColumnName=None,
                         zColumnName=None, mColumnName=None,
                         writeSummaryFile=True, writeImportancePlot=True,
                         writePartialDependencePlots=False, plotFileFormat=u'png', res=1000.,
                         width=3000., height=3000., pointSize=10.0, bg=u'white',
                         overwriteExisting=False):
        # Fit a random forest to the rows of an ArcGIS table using the given R
        # package ('randomForest' or, presumably, 'party' — see the cf*
        # parameters) and save the fitted model plus optional summary and plot
        # files next to outputModelFile.
        cls.__doc__.Obj.ValidateMethodInvocation()

        # If the caller requested emf format for the plot files,
        # convert the width and height from thousands of an inch to
        # inches.
        if plotFileFormat == u'emf':
            width = width / 1000
            height = height / 1000

        # Load the R package.
        dependency = RPackageDependency(rPackage)
        dependency.Initialize()

        # Load the table into a temporary data frame. Only the formula is
        # provided, so familyParam is unused here.
        dataFrameName, xColumnParam, yColumnParam, zColumnParam, mColumnParam, coordinateSystemParam, familyParam = _LoadDataFrameForModelFitting(inputTable, {_(u'formula') : formula}, {_(u'formula') : True}, None, None, None, None, where, xColumnName, yColumnName, zColumnName, mColumnName)

        # Fit the model and create the output files in the temp
        # directory.
        r = R.GetInterpreter()
        try:
            # Source the R helper scripts that live next to this module.
            R.EvaluateFile(os.path.join(os.path.dirname(sys.modules[cls.__module__].__file__), 'Utils.r'), False)
            R.EvaluateFile(os.path.join(os.path.dirname(sys.modules[cls.__module__].__file__), 'FitRandomForestForDataframe.r'), False)

            tempDir = TemporaryDirectory()
            tempOutputFile = os.path.join(tempDir.Path, u'output')

            # Fit the model. Optional numeric parameters are passed to R as
            # the literal NULL when the caller did not provide them.
            if mtry is not None:
                mtry = str(mtry)
            else:
                mtry = 'NULL'
            if cfMaxSurrogate is not None:
                cfMaxSurrogate = str(cfMaxSurrogate)
            else:
                cfMaxSurrogate = 'NULL'
            if seed is not None:
                seed = str(seed)
            else:
                seed = 'NULL'

            # The R function returns the response classes (None for a
            # regression forest, presumably) and the model terms, which drive
            # the plot-file moves below.
            classes, terms = r('FitRandomForestForDataframe(f=%s, trainingData=%s, ntree=%i, mtry=%s, rPackage="%s", outputModelFile="%s", replace=%s, cfMaxSurrogate=%s, seed=%s, importance=%s, useScaledImportance=%s, useConditionalImportance=%s, xVar=%s, yVar=%s, zVar=%s, mVar=%s, coordinateSystem=%s, writeSummaryFile=%s, writeImportancePlot=%s, writePartialDependencePlots=%s, plotFileFormat="%s", res=%f, width=%f, height=%f, pointSize=%f, bg="%s")' % (formula, dataFrameName, ntree, mtry, rPackage, tempOutputFile.replace('\\', '\\\\'), str(replace).upper(), cfMaxSurrogate, seed, str(importance).upper(), str(useScaledImportance).upper(), str(useConditionalImportance).upper(), xColumnParam, yColumnParam, zColumnParam, mColumnParam, coordinateSystemParam, str(writeSummaryFile).upper(), str(writeImportancePlot).upper(), str(writePartialDependencePlots).upper(), plotFileFormat, res, width, height, pointSize, bg))

            # Move the temporary output files to the requested
            # location.
            File.MoveSilent(tempOutputFile, outputModelFile, overwriteExisting=overwriteExisting)
            outputFilePrefix = os.path.splitext(outputModelFile)[0]
            if writeSummaryFile:
                File.MoveSilent(os.path.join(tempDir.Path, u'output_summary.txt'), outputFilePrefix + u'_summary.txt', overwriteExisting=overwriteExisting)

            # Importance and partial-dependence plots are only produced for
            # the 'randomForest' package; for classification models, one plot
            # per class may also exist.
            if rPackage == u'randomForest' and writeImportancePlot:
                if os.path.isfile(os.path.join(tempDir.Path, u'output_importance.%s' % plotFileFormat)):
                    File.MoveSilent(os.path.join(tempDir.Path, u'output_importance.%s' % plotFileFormat), outputFilePrefix + u'_importance.%s' % plotFileFormat, overwriteExisting=overwriteExisting)
                if classes is not None:
                    for c in classes:
                        if os.path.isfile(os.path.join(tempDir.Path, u'output_importance_class_%s.%s' % (c, plotFileFormat))):
                            File.MoveSilent(os.path.join(tempDir.Path, u'output_importance_class_%s.%s' % (c, plotFileFormat)), outputFilePrefix + u'_importance_class_%s.%s' % (c, plotFileFormat), overwriteExisting=overwriteExisting)
            if rPackage == u'randomForest' and writePartialDependencePlots:
                for term in terms:
                    if classes is not None:
                        for c in classes:
                            if os.path.isfile(os.path.join(tempDir.Path, u'output_pd_class_%s_%s.%s' % (c, term, plotFileFormat))):
                                File.MoveSilent(os.path.join(tempDir.Path, u'output_pd_class_%s_%s.%s' % (c, term, plotFileFormat)), outputFilePrefix + u'_pd_class_%s_%s.%s' % (c, term, plotFileFormat), overwriteExisting=overwriteExisting)
                    else:
                        if os.path.isfile(os.path.join(tempDir.Path, u'output_pd_%s.%s' % (term, plotFileFormat))):
                            File.MoveSilent(os.path.join(tempDir.Path, u'output_pd_%s.%s' % (term, plotFileFormat)), outputFilePrefix + u'_pd_%s.%s' % (term, plotFileFormat), overwriteExisting=overwriteExisting)

        # Delete the data frame.
        finally:
            r('if (exists("%s")) rm("%s")' % (dataFrameName, dataFrameName))

    @classmethod
    def PredictFromArcGISTable(cls, inputModelFile, inputTable=None,
                               predictedValuesField=None, cutoff=None,
                               where=None, ignoreOutOfRangeValues=True,
                               noDataValue=None,
                               outputPlotFile=None, measure1=u'tpr',
                               measure2=u'fpr', colorize=True,
                               outputSummaryFile=None, res=1000.,
                               width=3000., height=3000., pointSize=10.0, bg=u'white',
                               overwriteExisting=False):
        # Predict the response of a previously-fitted random forest for the
        # rows of an ArcGIS table. Thin wrapper around the module-level
        # _PredictFromArcGISTable helper.
        cls.__doc__.Obj.ValidateMethodInvocation()
        return _PredictFromArcGISTable('randomForest', _(u'Fit Random Forest Model'), inputModelFile, inputTable, predictedValuesField, cutoff, where, ignoreOutOfRangeValues, noDataValue, outputPlotFile, measure1, measure2, colorize, outputSummaryFile, res, width, height, pointSize, bg, overwriteExisting)

    @classmethod
    def PredictFromArcGISRasters(cls, inputModelFile, outputResponseRaster,
                                 templateRaster=None, rasterPredictorNames=None, predictorRasters=None,
                                 constantPredictorNames=None, constantPredictorValues=None, cutoff=None,
                                 resamplingTechniques=None, ignoreOutOfRangeValues=True, buildPyramids=False,
                                 overwriteExisting=False):
        # Predict the response of a previously-fitted random forest across a
        # set of ArcGIS rasters. Thin wrapper around the module-level
        # _PredictFromArcGISRasters helper; note the argument order differs
        # from this method's parameter order and must match the helper.
        cls.__doc__.Obj.ValidateMethodInvocation()
        _PredictFromArcGISRasters('randomForest', _(u'Fit Random Forest Model'), inputModelFile, outputResponseRaster, cutoff, constantPredictorNames, constantPredictorValues, rasterPredictorNames, predictorRasters, templateRaster, resamplingTechniques, ignoreOutOfRangeValues, None, buildPyramids, overwriteExisting)
class ModelEvaluation(object):
    # Tools for evaluating fitted binary classification models (performance
    # and ROC plots) and for splitting tables into training and test records.
    # Documentation is supplied by the MGET metadata framework via
    # DynamicDocString, so no literal docstrings appear here.
    __doc__ = DynamicDocString()

    @classmethod
    def PlotPerformanceOfBinaryClassificationModel(cls, inputModelFile,
                                                   measure1, measure2=None, summaryStats=None, title=None,
                                                   evaluationDataTable=None,
                                                   where=None,
                                                   outputPlotFile=None,
                                                   res=1000., width=3000., height=3000., pointSize=10.0, bg=u'white',
                                                   overwriteExisting=False):
        # Plot a performance measure (e.g. tpr vs. fpr) of a fitted binary
        # classification model, either from its training fit or from new
        # predictions on evaluationDataTable, optionally writing the plot to
        # outputPlotFile. Returns the requested summary statistics' values.
        cls.__doc__.Obj.ValidateMethodInvocation()

        # If the caller requested emf format for the plot file,
        # convert the width and height from thousands of an inch to
        # inches.
        if outputPlotFile is not None and outputPlotFile.lower().endswith(u'.emf'):
            width = width / 1000
            height = height / 1000

        # Load the fitted model into the R interpreter. The .RData file is
        # expected to define at least "model" and possibly "rPackage",
        # "xVar"/"yVar"/"zVar"/"mVar" and "coordinateSystem".
        r = R.GetInterpreter()
        dataFrameName = None
        newPredictionsName = None
        r('load("%s")' % inputModelFile.replace('\\', '\\\\'))
        try:
            if not r('exists("model")'):
                Logger.RaiseException(ValueError(_('The input model file %(file)s does not contain a variable called "model", indicating that it was not generated by the one of the MGET model fitting tools, such as Fit GLM. Please provide a file that was generated by the one of the MGET model fitting tools.') % {u'file': inputModelFile}))

            # If the model required an R package, load it.
            if r('exists("rPackage")'):
                dependency = RPackageDependency(unicode(r['rPackage']))
                dependency.Initialize()

            # Build the R expressions for the predicted and actual values.
            # rpart classification trees need special handling (class
            # probabilities and 0/1-coded factor levels); all other models use
            # their fitted values directly. If the caller specified an
            # evaluation data table, load it and create new predictions.
            if r('exists("rPackage")') and r['rPackage'] == 'rpart':
                if r('model$method').lower() != 'class':
                    Logger.RaiseException(ValueError(_('The input model file %(file)s contains a tree model that was fitted with the "%(method)s" splitting method. Models fitted with that method are not supported by this tool. This tool only supports models fitted with the "class" method.') % {u'file': inputModelFile, u'method': r('model$method')}))
                if r('length(attr(model, "ylevels"))') != 2:
                    Logger.RaiseException(ValueError(_('The classification tree model in %(file)s contains %(classes)i classes. This tool only supports models with two classes.') % {u'file': inputModelFile, u'classes': r('length(attr(model, "ylevels"))')}))
                if evaluationDataTable is None:
                    predictedValues = 'as.vector(predict(model, type="prob")[,2])'
                    actualValues = 'as.vector(model$y) - 1'
                else:
                    dataFrameName, newPredictionsName = cls._PredictionsForArcGISTable(inputModelFile, 'model', evaluationDataTable, where)
                    predictedValues = newPredictionsName
                    actualValues = 'as.vector(unclass(factor(%s$%s, levels=attr(model, "ylevels")))) - 1' % (dataFrameName, r('all.vars(attr(model$terms, "variables")[[2]])[[1]]'))
            else:
                if evaluationDataTable is None:
                    predictedValues = 'model$fitted'
                    actualValues = 'model$y'
                else:
                    dataFrameName, newPredictionsName = cls._PredictionsForArcGISTable(inputModelFile, 'model', evaluationDataTable, where)
                    predictedValues = newPredictionsName
                    actualValues = 'na.omit(%s)$%s' % (dataFrameName, r('all.vars(attr(terms(model$formula), "variables")[[2]])[[1]]'))

            # Create the plot.
            R.EvaluateFile(os.path.join(os.path.dirname(sys.modules[cls.__module__].__file__), 'PerfPlotForBinaryModel.r'), False)
            if measure2 is None:
                measure2 = u'cutoff'
            if title is not None:
                title = '"' + title.replace('"', '\\"') + '"'
            else:
                title = 'NULL'
            r['summaryStats'] = summaryStats
            if outputPlotFile is None:
                summaryStatValues = r('PerfPlotForBinaryModel(predictedValues=%s, actualValues=%s, measure1="%s", measure2="%s", summaryStats=summaryStats, .title=%s)' % (predictedValues, actualValues, measure1, measure2, title))
            else:
                # Render to a temp file first, then move it into place, so a
                # failed render does not clobber an existing plot.
                tempDir = TemporaryDirectory()
                tempOutputFile = os.path.join(tempDir.Path, u'plot' + os.path.splitext(outputPlotFile)[1])
                Logger.Info(_(u'Writing plot to %(file)s...') % {u'file': outputPlotFile})
                summaryStatValues = r('PerfPlotForBinaryModelToFile("%s", predictedValues=%s, actualValues=%s, measure1="%s", measure2="%s", summaryStats=summaryStats, .title=%s, res=%f, width=%f, height=%f, pointsize=%f, bg="%s")' % (tempOutputFile.replace('\\', '\\\\'), predictedValues, actualValues, measure1, measure2, title, res, width, height, pointSize, bg))
                File.MoveSilent(tempOutputFile, outputPlotFile, overwriteExisting=overwriteExisting)

        # Delete R variables assigned by this function (and those loaded from
        # the model file), so they do not leak into subsequent calls.
        finally:
            r('if (exists("summaryStats")) rm("summaryStats")')
            r('if (exists("model")) rm("model")')
            r('if (exists("rPackage")) rm("rPackage")')
            r('if (exists("xVar")) rm("xVar")')
            r('if (exists("yVar")) rm("yVar")')
            r('if (exists("zVar")) rm("zVar")')
            r('if (exists("mVar")) rm("mVar")')
            r('if (exists("coordinateSystem")) rm("coordinateSystem")')
            r('if (exists("f20985982305")) rm("f20985982305", pos=globalenv())')
            r('if (exists("data20985982305")) rm("data20985982305", pos=globalenv())')
            r('if (exists("fam20985982305")) rm("fam20985982305", pos=globalenv())')
            if dataFrameName is not None:
                r('if (exists("%s")) rm("%s")' % (dataFrameName, dataFrameName))
            if newPredictionsName is not None:
                r('if (exists("%s")) rm("%s")' % (newPredictionsName, newPredictionsName))

        # Return the summary statistics' values.
        return summaryStatValues

    @classmethod
    def PlotROCOfBinaryClassificationModel(cls, inputModelFile,
                                           cutoff=u'Automatic', cutoffValue=None, colorize=True, title=None,
                                           evaluationDataTable=None,
                                           where=None,
                                           outputSummaryFile=None,
                                           outputPlotFile=None, res=1000., width=3000., height=3000., pointSize=10.0,
                                           bg=u'white',
                                           overwriteExisting=False):
        # Plot the ROC curve of a fitted binary classification model and
        # return (outputCutoff, tp, fp, fn, tn, auc, mxe, prbe, rmse).
        cls.__doc__.Obj.ValidateMethodInvocation()

        # Perform additional parameter validation.
        # NOTE(review): the comparison is against lowercase 'specified' while
        # the default is u'Automatic'; presumably the metadata framework
        # lowercases the value before this point — confirm.
        if cutoff == u'specified' and cutoffValue is None:
            Logger.RaiseException(ValueError(_(u'When you provide the value "Specified" for the Cutoff parameter, you must also provide a value for the Cutoff Value parameter.')))

        # If the caller requested emf format for the plot file,
        # convert the width and height from thousands of an inch to
        # inches.
        if outputPlotFile is not None and outputPlotFile.lower().endswith(u'.emf'):
            width = width / 1000
            height = height / 1000

        # Load the fitted model.
        r = R.GetInterpreter()
        dataFrameName = None
        newPredictionsName = None
        r('load("%s")' % inputModelFile.replace('\\', '\\\\'))
        try:
            if not r('exists("model")'):
                Logger.RaiseException(ValueError(_('The input model file %(file)s does not contain a variable called "model", indicating that it was not generated by the one of the MGET model fitting tools, such as Fit GLM. Please provide a file that was generated by the one of the MGET model fitting tools.') % {u'file': inputModelFile}))

            # If the model required an R package, load it.
            if r('exists("rPackage")'):
                dependency = RPackageDependency(unicode(r['rPackage']))
                dependency.Initialize()

            # Build the R expressions for the predicted and actual values,
            # exactly as in PlotPerformanceOfBinaryClassificationModel. If the
            # caller specified a evaluation data table, load it and create new
            # predictions.
            if r('exists("rPackage")') and r['rPackage'] == 'rpart':
                if r('model$method').lower() != 'class':
                    Logger.RaiseException(ValueError(_('The input model file %(file)s contains a tree model that was fitted with the "%(method)s" splitting method. Models fitted with that method are not supported by this tool. This tool only supports models fitted with the "class" method.') % {u'file': inputModelFile, u'method': r('model$method')}))
                if r('length(attr(model, "ylevels"))') != 2:
                    Logger.RaiseException(ValueError(_('The classification tree model in %(file)s contains %(classes)i classes. This tool only supports models with two classes.') % {u'file': inputModelFile, u'classes': r('length(attr(model, "ylevels"))')}))
                if evaluationDataTable is None:
                    predictedValues = 'as.vector(predict(model, type="prob")[,2])'
                    actualValues = 'as.vector(model$y) - 1'
                else:
                    dataFrameName, newPredictionsName = cls._PredictionsForArcGISTable(inputModelFile, 'model', evaluationDataTable, where)
                    predictedValues = newPredictionsName
                    actualValues = 'as.vector(unclass(factor(%s$%s, levels=attr(model, "ylevels")))) - 1' % (dataFrameName, r('all.vars(attr(model$terms, "variables")[[2]])[[1]]'))
            else:
                if evaluationDataTable is None:
                    predictedValues = 'model$fitted'
                    actualValues = 'model$y'
                else:
                    dataFrameName, newPredictionsName = cls._PredictionsForArcGISTable(inputModelFile, 'model', evaluationDataTable, where)
                    predictedValues = newPredictionsName
                    actualValues = 'na.omit(%s)$%s' % (dataFrameName, r('all.vars(attr(terms(model$formula), "variables")[[2]])[[1]]'))

            # Create the plot.
            R.EvaluateFile(os.path.join(os.path.dirname(sys.modules[cls.__module__].__file__), 'ROCPlotForBinaryModel.r'), False)
            if cutoffValue is None:
                cutoffValue = 0.0
            if title is not None:
                title = '"' + title.replace('"', '\\"') + '"'
            else:
                title = 'NULL'
            if outputSummaryFile is not None:
                outputSummaryFile = '"' + outputSummaryFile.replace('\\', '\\\\') + '"'
            else:
                outputSummaryFile = 'NULL'
            if outputPlotFile is None:
                outputCutoff, tp, fp, fn, tn, auc, mxe, prbe, rmse = r('ROCPlotForBinaryModel(predictedValues=%s, actualValues=%s, cutoff="%s", cutoffValue=%f, colorize=%s, .title=%s, summaryFile=%s)' % (predictedValues, actualValues, cutoff, cutoffValue, str(colorize).upper(), title, outputSummaryFile))
            else:
                # Render to a temp file first, then move it into place.
                tempDir = TemporaryDirectory()
                tempOutputFile = os.path.join(tempDir.Path, u'plot' + os.path.splitext(outputPlotFile)[1])
                Logger.Info(_(u'Writing plot to %(file)s...') % {u'file': outputPlotFile})
                outputCutoff, tp, fp, fn, tn, auc, mxe, prbe, rmse = r('ROCPlotForBinaryModelToFile("%s", predictedValues=%s, actualValues=%s, cutoff="%s", cutoffValue=%f, colorize=%s, .title=%s, summaryFile=%s, res=%f, width=%f, height=%f, pointsize=%f, bg="%s")' % (tempOutputFile.replace('\\', '\\\\'), predictedValues, actualValues, cutoff, cutoffValue, str(colorize).upper(), title, outputSummaryFile, res, width, height, pointSize, bg))
                File.MoveSilent(tempOutputFile, outputPlotFile, overwriteExisting=overwriteExisting)

        # Delete R variables assigned by this function.
        finally:
            r('if (exists("model")) rm("model")')
            r('if (exists("rPackage")) rm("rPackage")')
            r('if (exists("xVar")) rm("xVar")')
            r('if (exists("yVar")) rm("yVar")')
            r('if (exists("zVar")) rm("zVar")')
            r('if (exists("mVar")) rm("mVar")')
            r('if (exists("coordinateSystem")) rm("coordinateSystem")')
            r('if (exists("f20985982305")) rm("f20985982305", pos=globalenv())')
            r('if (exists("data20985982305")) rm("data20985982305", pos=globalenv())')
            r('if (exists("fam20985982305")) rm("fam20985982305", pos=globalenv())')
            if dataFrameName is not None:
                r('if (exists("%s")) rm("%s")' % (dataFrameName, dataFrameName))
            if newPredictionsName is not None:
                r('if (exists("%s")) rm("%s")' % (newPredictionsName, newPredictionsName))

        # Return successfully.
        return outputCutoff, tp, fp, fn, tn, auc, mxe, prbe, rmse

    @classmethod
    def _PredictionsForArcGISTable(cls, inputModelFile, modelName, evaluationDataTable, where):
        # Load evaluationDataTable into a new R data frame and predict the
        # response of the fitted model (R variable modelName) for its rows.
        # Returns (dataFrameName, newPredictionsName) — the names of the two R
        # variables created; the caller owns their cleanup on success.

        # First verify that the table contains all of the fields used
        # to fit the model. Model terms that are not table fields may instead
        # be the point X/Y/Z/M coordinates recorded in the model file.
        r = R.GetInterpreter()
        gp = GeoprocessorManager.GetWrappedGeoprocessor()
        fields = []
        xColumnName = None
        yColumnName = None
        zColumnName = None
        mColumnName = None
        d = gp.Describe(evaluationDataTable)
        gotPointFeatures = hasattr(d, u'ShapeType') and isinstance(d.ShapeType, basestring) and d.ShapeType.lower() == u'point'
        if r('exists("rPackage")') and r['rPackage'] == 'rpart':        # TODO: add support for randomForest and party
            allVars = r('all.vars(%s$terms)' % modelName)
        else:
            allVars = r('all.vars(%s$formula)' % modelName)
        if isinstance(allVars, basestring):
            allVars = [allVars]
        for var in allVars:
            conn = ArcGIS91DatabaseConnection()
            if conn.FieldExists(evaluationDataTable, var):
                fields.append(var)
            else:
                if var == r['xVar']:
                    if gotPointFeatures:
                        xColumnName = var
                    else:
                        Logger.RaiseException(ValueError(_('The input model in %(file)s was fitted to a point feature class and the point X coordinate was used as the variable "%(var)s" in the model, but the evaluation data table %(table)s does not contain point features, nor does it contain a field named "%(var)s". For the evaluation data, you must provide either a point feature class or a table that contains a field named "%(var)s".') % {u'file': inputModelFile, u'table': evaluationDataTable, u'var': var}))
                elif var == r['yVar']:
                    if gotPointFeatures:
                        yColumnName = var
                    else:
                        Logger.RaiseException(ValueError(_('The input model in %(file)s was fitted to a point feature class and the point Y coordinate was used as the variable "%(var)s" in the model, but the evaluation data table %(table)s does not contain point features, nor does it contain a field named "%(var)s". For the evaluation data, you must provide either a point feature class or a table that contains a field named "%(var)s".') % {u'file': inputModelFile, u'table': evaluationDataTable, u'var': var}))
                elif var == r['zVar']:
                    if gotPointFeatures and d.HasZ:
                        zColumnName = var
                    elif gotPointFeatures:
                        Logger.RaiseException(ValueError(_('The input model in %(file)s was fitted to a point feature class and the point Z coordinate was used as the variable "%(var)s" in the model, but the point feature class %(table)s provided as the evaluation data table does not have Z coordinates, nor does it contain a field named "%(var)s". For the evaluation data, you must provide either a point feature class that has Z values or a table that contains a field named "%(var)s".') % {u'file': inputModelFile, u'table': evaluationDataTable, u'var': var}))
                    else:
                        Logger.RaiseException(ValueError(_('The input model in %(file)s was fitted to a point feature class and the point Z coordinate was used as the variable "%(var)s" in the model, but the evaluation data table %(table)s does not contain point features, nor does it contain a field named "%(var)s". For the evaluation data, you must provide either a point feature class that has Z values or a table that contains a field named "%(var)s".') % {u'file': inputModelFile, u'table': evaluationDataTable, u'var': var}))
                elif var == r['mVar']:
                    if gotPointFeatures and d.HasM:
                        mColumnName = var
                    elif gotPointFeatures:
                        Logger.RaiseException(ValueError(_('The input model in %(file)s was fitted to a point feature class and the point M value was used as the variable "%(var)s" in the model, but the point feature class %(table)s provided as the evaluation data table does not have M values, nor does it contain a field named "%(var)s". For the evaluation data, you must provide either a point feature class that has M values or a table that contains a field named "%(var)s".') % {u'file': inputModelFile, u'table': evaluationDataTable, u'var': var}))
                    else:
                        Logger.RaiseException(ValueError(_('The input model in %(file)s was fitted to a point feature class and the point M value was used as the variable "%(var)s" in the model, but the evaluation data table %(table)s does not contain point features, nor does it contain a field named "%(var)s". For the evaluation data, you must provide either a point feature class that has M values or a table that contains a field named "%(var)s".') % {u'file': inputModelFile, u'table': evaluationDataTable, u'var': var}))
                else:
                    Logger.RaiseException(ValueError(_('The input model in %(file)s was fitted to a table containing a field named "%(var)s" but the evaluation data table %(table)s does not contain this field. You must provide a table that contains this field.') % {u'file': inputModelFile, u'table': evaluationDataTable, u'var': var}))

        # Load the table into a temporary data frame.
        dataFrameName = R.GetUniqueVariableName()
        R.LoadDataFrameFromArcGISTable(evaluationDataTable, dataFrameName, fields=fields, where=where, xColumnName=xColumnName, yColumnName=yColumnName, zColumnName=zColumnName, mColumnName=mColumnName)
        if r('length(%s)' % dataFrameName) <= 0:
            if where is not None:
                Logger.RaiseException(ValueError(_(u'The where clause "%(where)s" did not select any rows from the table %(table)s.') % {u'table': evaluationDataTable, u'where': where}))
            else:
                Logger.RaiseException(ValueError(_(u'The table %(table)s is empty.') % {u'table': evaluationDataTable}))

        # Create predictions from the evaluation data. On failure, remove the
        # R variables we created before propagating the original exception.
        #
        # BUG FIX: newPredictionsName was previously assigned only inside the
        # try block, after statements that can raise; the cleanup handler then
        # referenced an unbound local, masking the real error with a
        # NameError. Initialize it to None and guard the cleanup.
        newPredictionsName = None
        try:
            Logger.Info(_(u'Predicting the response for the evaluation data...'))
            r('if (exists("rPackage")) library(rPackage, character.only=TRUE)')
            newPredictionsName = R.GetUniqueVariableName()
            if r('exists("rPackage")') and r['rPackage'] == 'rpart':
                r('%s <- as.vector(predict(%s, newdata=%s, type="prob")[,2])' % (newPredictionsName, modelName, dataFrameName))
            else:
                r('%s <- as.vector(predict(%s, newdata=na.omit(%s), type="response"))' % (newPredictionsName, modelName, dataFrameName))
        except:
            r('if (exists("%s")) rm("%s")' % (dataFrameName, dataFrameName))
            if newPredictionsName is not None:
                r('if (exists("%s")) rm("%s")' % (newPredictionsName, newPredictionsName))
            raise

        # Return the names of the R variables we created for the data
        # frame and the predictions.
        return dataFrameName, newPredictionsName

    @classmethod
    def RandomlySplitArcGISTableIntoTrainingAndTestRecords(cls, table, percentTest=33.33333, field=u'TestData', where=None, seed=None):
        # Randomly designate each row of the table (or of the rows selected by
        # the where clause) as a training record (field value 0) or a test
        # record (field value 1), with approximately percentTest percent of
        # the rows becoming test records. Returns the table.
        cls.__doc__.Obj.ValidateMethodInvocation()

        # If the field already exists, validate that it is the correct
        # type.
        conn = ArcGIS91DatabaseConnection()
        dataType = conn.GetFieldDataType(table, field)
        if dataType is not None:
            if dataType not in [u'SHORT', u'LONG', u'FLOAT', u'DOUBLE']:
                Logger.RaiseException(ValueError(_(u'Existing field %(field)s of table %(table)s cannot be used because it has the data type %(dt)s. Please specify a field that has the data type SHORT, LONG, FLOAT, or DOUBLE.') % {u'field': field, u'table': table, u'dt': dataType}))

        # If the field does not already exist, create it.
        else:
            conn.AddField(table, field, u'SHORT')
            Logger.Info(_(u'Added field %(field)s to table %(table)s.') % {u'field': field, u'table': table})

        # If the caller provided a where clause, create a table view.
        gp = GeoprocessorManager.GetWrappedGeoprocessor()
        if where is None:
            tableView = table
        else:
            tableView = GeoprocessorManager.GetUniqueLayerName()
            gp.MakeTableView_management(table, tableView, where)
        try:
            # If there were no rows, warn the user.
            totalRows = conn.GetRowCount(tableView)
            if totalRows <= 0:
                if where is not None:
                    Logger.Warning(_(u'No rows of table %(table)s were changed because the where clause "%(where)s" did not select any rows from the table.') % {u'table': table, u'where': where})
                else:
                    Logger.Warning(_(u'Table %(table)s is empty.') % {u'table': table})

            # Otherwise, compute the number of rows that should be
            # designated training and evaluation, create a shuffled
            # list of 0 and 1 values to draw from, and designate the
            # rows.
            else:
                if seed is not None:
                    random.seed(seed)
                evaluationRows = int(round(totalRows * percentTest / 100.0))
                trainingRows = totalRows - evaluationRows
                values = [0] * trainingRows + [1] * evaluationRows
                random.shuffle(values)
                Logger.Info(_(u'Updating table %(table)s: designating %(t)i rows as training records and %(e)i rows as test records...') % {u'table': table, u't': trainingRows, u'e': evaluationRows})
                i = 0
                d = gp.Describe(tableView)
                if hasattr(d, 'OIDFieldName') and isinstance(d.OIDFieldName, basestring):
                    # Ordering by the OID guarantees the rows are processed in
                    # the same order every time, so that the seed parameter
                    # produces the desired effect.
                    cur = conn.OpenUpdateCursor(tableView, orderBy=[d.OIDFieldName], directions=['Ascending'], rowCount=totalRows)
                else:
                    cur = conn.OpenUpdateCursor(tableView, rowCount=totalRows)
                try:
                    while cur.NextRow():
                        cur.SetValue(field, values[i])
                        cur.UpdateRow()
                        i += 1
                        if i >= len(values):        # This should never happen
                            break
                finally:
                    del cur

        # Delete the table view, if we created one.
        finally:
            try:
                # BUG FIX: this previously tested GetArcGISMajorVersion()
                # against both 9 and 1, which is always False, so the view was
                # never deleted. The intent (per the comment above) was
                # evidently to delete the view on ArcGIS 9.1, i.e. major
                # version 9 and minor version 1.
                if tableView != table and GeoprocessorManager.GetArcGISMajorVersion() == 9 and GeoprocessorManager.GetArcGISMinorVersion() == 1:
                    gp.Delete(tableView)
            except:
                # Best-effort cleanup; failure to delete the view is not
                # fatal.
                pass

        # Return successfully.
        return table
# Private helper functions
def _LoadDataFrameForModelFitting(inputTable, formulas, formulaIsRequired,
family, link, variance, theta, where, xColumnName, yColumnName, zColumnName,
mColumnName, extraFieldsToLoad=None):
# Determine if the input table is a point feature class.
gp = GeoprocessorManager.GetWrappedGeoprocessor()
d = gp.Describe(inputTable)
gotPointFeatures = hasattr(d, u'ShapeType') and isinstance(d.ShapeType,
basestring) and d.ShapeType.lower() == u'point'
# Create a list of all of the fields in the formula.
r = R.GetInterpreter()
fields = set()
for name, formula in formulas.items():
if formula is not None:
terms = r('all.vars(%s)' % formula)
if isinstance(terms, types.ListType):
fields = fields.union(set(terms))
elif isinstance(terms, basestring):
fields = fields.union(set([terms]))
elif formulaIsRequired[name]:
Logger.RaiseException(ValueError(_(u'The %(name)s
"%(formula)s" did not include any terms. Please provide a %(name)s that
includes at least one term.') % {u'name': name, u'formula': formula}))
if extraFieldsToLoad is not None:
fields = fields.union(set(extraFieldsToLoad))
fields = list(fields)
# If the caller provided an X, Y, Z, or M column name and it
# appears in a formula, report an error if the input table is not
# a point feature class.
if xColumnName is not None and xColumnName in fields:
if gotPointFeatures:
if xColumnName == u'x' or xColumnName == u'X':
Logger.RaiseException(ValueError(_(u'The variable
representing the X coordinates of point features cannot be named "x". We
apologize for this inconvenient restriction. Please specify a different
name.'))) # Imposed by rgdal
fields.remove(xColumnName)
else:
Logger.RaiseException(ValueError(_(u'The variable "%(var)s" is
designated as the X coordinate of point features, but the input table
"%(table)s" does not contain point features. Please specify a table that
includes point features or remove this variable from the model.') % {u'var':
xColumnName, u'table': inputTable}))
else:
xColumnName = None
if yColumnName is not None and yColumnName in fields:
if gotPointFeatures:
if yColumnName == u'y' or yColumnName == u'Y':
Logger.RaiseException(ValueError(_(u'The variable
representing the Y coordinates of point features cannot be named "y". We
apologize for this inconvenient restriction. Please specify a different
name.'))) # Imposed by rgdal
fields.remove(yColumnName)
else:
Logger.RaiseException(ValueError(_(u'The variable "%(var)s" is
designated as the Y coordinate of point features, but the input table
"%(table)s" does not contain point features. Please specify a table that
includes point features or remove this variable from the model.') % {u'var':
yColumnName, u'table': inputTable}))
else:
yColumnName = None
if zColumnName is not None and zColumnName in fields:
if gotPointFeatures:
if d.HasZ:
fields.remove(zColumnName)
else:
Logger.RaiseException(ValueError(_(u'The variable "%(var)s"
is designated as the Z coordinate of point features, but the feature class
"%(table)s" does not have Z coordinates. Please specify a feature class that
includes Z coordinates or remove this variable from the model.') % {u'var':
zColumnName, u'table': inputTable}))
else:
Logger.RaiseException(ValueError(_(u'The variable "%(var)s" is
designated as the X coordinate of point features, but the input table
"%(table)s" does not contain point features. Please specify a table that
includes point features or remove this variable from the model.') % {u'var':
zColumnName, u'table': inputTable}))
else:
zColumnName = None
if mColumnName is not None and mColumnName in fields:
if gotPointFeatures:
if d.HasZ:
fields.remove(mColumnName)
else:
Logger.RaiseException(ValueError(_(u'The variable "%(var)s"
is designated as the M value of point features, but the feature class
"%(table)s" does not have M values. Please specify a feature class that
includes M values or remove this variable from the model.') % {u'var':
mColumnName, u'table': inputTable}))
else:
Logger.RaiseException(ValueError(_(u'The variable "%(var)s" is
designated as the M value of point features, but the input table "%(table)s"
does not contain point features. Please specify a table that includes point
features or remove this variable from the model.') % {u'var': mColumnName,
u'table': inputTable}))
else:
mColumnName = None
if u'x' in fields or u'X' in fields or u'y' in fields or u'Y' in fields:
Logger.RaiseException(ValueError(_(u'The model cannot reference a
field named "x" or "y". We apologize for this inconvenient restriction. To
use this field in the model, rename it to something other than "x" or
"y".'))) # Imposed by rgdal
# Load the table into an R data frame.
dataFrameName = R.GetUniqueVariableName()
R.LoadDataFrameFromArcGISTable(inputTable, dataFrameName, fields=fields,
where=where, xColumnName=xColumnName, yColumnName=yColumnName,
zColumnName=zColumnName, mColumnName=mColumnName)
try:
# Validate that the data frame contains at least one row.
if r('length(%s)' % dataFrameName) <= 0:
if where is not None:
Logger.RaiseException(ValueError(_(u'The where clause
"%(where)s" did not select any rows from the table %(table)s.') % {u'table':
inputTable, u'where': where}))
else:
Logger.RaiseException(ValueError(_(u'The table %(table)s is
empty.') % {u'table': inputTable}))
# Prepare the column names to be passed as parameters to the R
# model-fitting function.
if xColumnName is not None:
xColumnParam = u'"' + xColumnName + u'"'
else:
xColumnParam = u'NULL'
if yColumnName is not None:
yColumnParam = u'"' + yColumnName + u'"'
else:
yColumnParam = u'NULL'
if zColumnName is not None:
zColumnParam = u'"' + zColumnName + u'"'
else:
zColumnParam = u'NULL'
if mColumnName is not None:
mColumnParam = u'"' + mColumnName + u'"'
else:
mColumnParam = u'NULL'
# If the input table is a point feature class, prepare its
# coordinate system string to be passed as a parameter to the
# R model-fitting function.
if gotPointFeatures:
coordinateSystemParam = u'"' +
gp.CreateSpatialReference_management(d.SpatialReference).split(u';')[0] + u'"'
else:
coordinateSystemParam = u'NULL'
# Create a family string to be passed as a parameter to the R
# model-fitting function.
if family is not None:
if link is None:
if family == u'binomial':
link = u'logit'
elif family == u'gaussian':
link = u'identity'
elif family == u'Gamma':
link = u'inverse'
elif family == u'inverse.gaussian':
link = u'1/mu^2'
elif family == u'negbin':
link = u'log'
elif family == u'poisson':
link = u'log'
elif family == u'quasi':
link = u'identity'
elif family == u'quasibinomial':
link = u'logit'
elif family == u'quasipoisson':
link = u'log'
else:
Logger.RaiseException(_(u'Programming error in this tool:
unknown model family "%s". Please contact the author of this tool for
assistance.') % family)
if family == u'negbin':
familyParam = '%s(theta=%s, link="%s")' % (family, theta,
link)
elif family == u'quasi':
if variance is None:
variance = u'constant'
familyParam = '%s(link="%s", variance="%s")' % (family, link,
variance)
else:
familyParam = '%s(link="%s")' % (family, link)
else:
familyParam = None
except:
r('rm("%s")' % dataFrameName)
raise
# Return successfully.
return dataFrameName, xColumnParam, yColumnParam, zColumnParam,
mColumnParam, coordinateSystemParam, familyParam
def _PredictFromArcGISTable(modelType, fitToolName, inputModelFile, inputTable, predictedValuesField, cutoff, where, ignoreOutOfRangeValues, noDataValue, outputPlotFile, measure1, measure2, colorize, outputSummaryFile, res, width, height, pointSize, bg, overwriteExisting):
    """Predict a previously-fitted R model for the rows of an ArcGIS table.

    Loads the fitted model from inputModelFile (an R workspace saved by one
    of the Fit tools), optionally loads inputTable into an R data frame,
    invokes the R function PredictModelForDataframe, optionally writes the
    predicted values back to predictedValuesField, and optionally produces a
    diagnostic plot and summary file.

    Returns a tuple (inputTable, newCutoff), where newCutoff is the cutoff
    reported back by the R prediction code.

    NOTE(review): the noDataValue parameter is accepted for interface
    compatibility but is not referenced anywhere in this function.
    """

    # If the caller requested emf format for the plot file, convert the
    # width and height from thousandths of an inch to inches, which is what
    # the R emf device expects.

    if outputPlotFile is not None and outputPlotFile.lower().endswith(u'.emf'):
        width = width / 1000
        height = height / 1000

    # Load the fitted model. Backslashes must be doubled because the path is
    # embedded in an R string literal.

    dataFrameName = None
    r = R.GetInterpreter()
    r('load("%s")' % inputModelFile.replace('\\', '\\\\'))
    try:
        # Validate that the loaded workspace actually contains a model of
        # the expected type.
        #
        # BUG FIX: the first message below uses %(type)s, but the
        # substitution dictionary previously omitted the 'type' key, causing
        # a KeyError whenever this error path was taken. The key is now
        # supplied.

        if not r('exists("model")'):
            Logger.RaiseException(ValueError(_('The input model file %(file)s does not contain a variable called "model", indicating that it was not generated by the Fit %(type)s tool. Please provide a file that was generated by the %(fitToolName)s tool.') % {u'file': inputModelFile, u'type': modelType, u'fitToolName': fitToolName}))

        if modelType.lower() not in r('tolower(class(model))'):
            Logger.RaiseException(ValueError(_('The input model file %(file)s contains a variable called "model" but it is not a %(type)s, indicating that it was not generated by the %(fitToolName)s tool. Please provide a file that was generated by the %(fitToolName)s tool.') % {u'file': inputModelFile, u'type': modelType, u'fitToolName': fitToolName}))

        if r('exists("rPackage")'):
            Logger.Info(_(u'Loaded %(type)s from %(file)s. The model was fitted with the R %(pkg)s package.') % {u'type': r('class(model)[1]'), u'file': inputModelFile, u'pkg': r['rPackage']})
        else:
            Logger.Info(_(u'Loaded %(type)s from %(file)s.') % {u'type': r('class(model)[1]'), u'file': inputModelFile, })

        # If the model required an R package, load it.

        if r('exists("rPackage")'):
            dependency = RPackageDependency(unicode(r['rPackage']))
            dependency.Initialize()

        # If it is a GAM fitted by the R gam package, we have to work
        # around a bug in predict.gam. The situation is described in
        # more detail in a message to the R help mailing list titled
        # "use of step.gam (from package 'gam') and superassignment
        # inside functions" dated 28-Feb-2008 (see
        # https://stat.ethz.ch/pipermail/r-help/2008-February/155586.html).
        # Define uniquely-named global variables for the formula, data
        # and family parameters. This is very hacky but it is the only
        # way I could get it working. Note the use of the <<-
        # operator. Hopefully these names are unique.

        if modelType.lower() == 'gam' and r['rPackage'] == 'gam':
            r('f20985982305 <<- model$formula')
            r('data20985982305 <<- model$data')
            r('fam20985982305 <<- model$family')

        # Source the R helper scripts that define PrepareForPrediction and
        # PredictModelForDataframe.

        R.EvaluateFile(os.path.join(os.path.dirname(sys.modules[GLM.__module__].__file__), 'Utils.r'), False)
        R.EvaluateFile(os.path.join(os.path.dirname(sys.modules[GLM.__module__].__file__), 'PredictModel.r'), False)

        if r('exists("rPackage")'):
            rPackage = '"' + r['rPackage'] + '"'
        else:
            rPackage = 'NULL'

        # randomForest and party models carry their training data in a
        # separate R variable; other model types embed it in model$data.

        if r('exists("rPackage")') and r['rPackage'] in ['randomForest', 'party']:
            trainingData = 'trainingData'
        else:
            trainingData = 'model$data'

        # If the caller provided a table, load it into a temporary data
        # frame.

        if inputTable is not None:

            # First verify that the table contains fields for all of the
            # predictor variables in the model. Predictors representing the
            # point X, Y, Z, or M coordinates are satisfied by the feature
            # class geometry rather than an attribute field.

            from GeoEco.Datasets.ArcGIS import ArcGISTable
            table = ArcGISTable(inputTable)
            gotPointFeatures = table.GeometryType in [u'Point', u'Point25D']

            fields = []
            xColumnName = None
            yColumnName = None
            zColumnName = None
            mColumnName = None

            responseVariable, allPredictors, minResponseValue, maxResponseValue, isBinaryClassification, isNonbinaryClassification, minFittedPredictorValues, maxFittedPredictorValues, categoricalPredictors, categoricalPredictorLevels, warnedAboutMissingLevels, allValuesForPredictorAreNA, allValuesAreNA = r('PrepareForPrediction(model, %s, %s)' % (trainingData, rPackage))

            if isinstance(allPredictors, basestring):       # rpy will return a vector of one string as a Python string, rather than a Python list containing one Python string
                allPredictors = [allPredictors]

            for var in allPredictors:
                field = table.GetFieldByName(var)
                if field is not None:
                    fields.append(var)

                elif var == r['xVar']:
                    if table.GeometryType in [u'Point', u'Point25D']:
                        xColumnName = var
                    else:
                        Logger.RaiseException(ValueError(_('The model in %(file)s was fitted to a point feature class and the point X coordinate was used as the "%(var)s" variable in the model, but the table %(table)s does not contain point features, nor does it contain a field named "%(var)s". Please provide either a point feature class, or a table that contains a field named "%(var)s".') % {u'file': inputModelFile, u'table': inputTable, u'var': var}))

                elif var == r['yVar']:
                    if table.GeometryType in [u'Point', u'Point25D']:
                        yColumnName = var
                    else:
                        Logger.RaiseException(ValueError(_('The model in %(file)s was fitted to a point feature class and the point Y coordinate was used as the "%(var)s" variable in the model, but the table %(table)s does not contain point features, nor does it contain a field named "%(var)s". Please provide either a point feature class, or a table that contains a field named "%(var)s".') % {u'file': inputModelFile, u'table': inputTable, u'var': var}))

                elif var == r['zVar']:
                    if table.GeometryType == u'Point25D':
                        zColumnName = var
                    elif table.GeometryType == u'Point':
                        Logger.RaiseException(ValueError(_('The model in %(file)s was fitted to a point feature class and the point Z coordinate was used as the "%(var)s" variable in the model, but the point feature class %(table)s provided as the table does not have Z coordinates, nor does it contain a field named "%(var)s". Please provide either a point feature class that has Z values, or a table that contains a field named "%(var)s".') % {u'file': inputModelFile, u'table': inputTable, u'var': var}))
                    else:
                        Logger.RaiseException(ValueError(_('The model in %(file)s was fitted to a point feature class and the point Z coordinate was used as the "%(var)s" variable in the model, but the table %(table)s does not contain point features, nor does it contain a field named "%(var)s". Please provide either a point feature class that has Z values, or a table that contains a field named "%(var)s".') % {u'file': inputModelFile, u'table': inputTable, u'var': var}))

                elif var == r['mVar']:
                    if table.GeometryType in [u'Point', u'Point25D']:
                        gp = GeoprocessorManager.GetWrappedGeoprocessor()
                        if gp.Describe(inputTable).HasM:
                            mColumnName = var
                        else:
                            Logger.RaiseException(ValueError(_('The model in %(file)s was fitted to a point feature class and the point M value was used as the "%(var)s" variable in the model, but the point feature class %(table)s does not have M values, nor does it contain a field named "%(var)s". Please provide either a point feature class that has M values, or a table that contains a field named "%(var)s".') % {u'file': inputModelFile, u'table': inputTable, u'var': var}))
                    else:
                        Logger.RaiseException(ValueError(_('The model in %(file)s was fitted to a point feature class and the point M value was used as the "%(var)s" variable in the model, but the table %(table)s does not contain point features, nor does it contain a field named "%(var)s". Please provide either a point feature class that has M values, or a table that contains a field named "%(var)s".') % {u'file': inputModelFile, u'table': inputTable, u'var': var}))

                else:
                    Logger.RaiseException(ValueError(_('The model in %(file)s contains a variable named "%(var)s" but the table %(table)s does not contain a field with that name. Please provide a table that contains a field with that name.') % {u'file': inputModelFile, u'table': inputTable, u'var': var}))

            # Check whether the table contains a field for the response
            # variable.
            #
            # BUG FIX: the error message below previously substituted the
            # stale loop variable "var" (the last predictor, or an unbound
            # name when the model has no predictors) instead of
            # responseVariable.

            field = table.GetFieldByName(responseVariable)
            if field is not None:
                fields.append(responseVariable)
            elif predictedValuesField is None:
                Logger.RaiseException(ValueError(_('The table %(table)s does not contain a field named "%(var)s" for the response variable, and you did not specify a field to receive the predicted values. Thus, there is no purpose to running this tool: the model performance cannot be assessed without the response variable, and the predicted results cannot be stored without a field to receive them. Please provide a table containing a field for the response variable or designate a field to receive the predicted response.') % {u'table': inputTable, u'var': responseVariable}))

            # Load the table into a temporary data frame, ordering by OID
            # when possible so predictions can be written back row-by-row in
            # the same order later.

            if table.HasOID:
                orderBy = [table.OIDFieldName]
                directions = [u'Ascending']
            else:
                orderBy = None
                directions = None

            dataFrameName = R.GetUniqueVariableName()
            R.LoadDataFrameFromArcGISTable(inputTable, dataFrameName, fields=fields, where=where, orderBy=orderBy, directions=directions, xColumnName=xColumnName, yColumnName=yColumnName, zColumnName=zColumnName, mColumnName=mColumnName)

            if r('length(%s)' % dataFrameName) <= 0:
                if where is not None:
                    Logger.RaiseException(ValueError(_(u'The where clause "%(where)s" did not select any rows from the table %(table)s. No predictions can be done.') % {u'table': inputTable, u'where': where}))
                else:
                    Logger.RaiseException(ValueError(_(u'The table %(table)s is empty. No predictions can be done.') % {u'table': inputTable}))

        # Do the prediction. The string parameters below are formatted as R
        # literals (quoted strings or NULL), hence the embedded quotes.

        tempDir = TemporaryDirectory()
        Logger.Info(_(u'Predicting...'))

        if inputTable is None:
            newData = 'NULL'
        else:
            newData = dataFrameName

        if cutoff is not None:
            cutoff = repr(cutoff)
        else:
            cutoff = 'NULL'

        if outputPlotFile is not None:
            tempPlotFile = '"' + os.path.join(tempDir.Path, u'plot' + outputPlotFile[-4:]).replace('\\', '\\\\') + '"'
        else:
            tempPlotFile = 'NULL'

        if measure1 is not None:
            measure1 = '"' + measure1 + '"'
        else:
            measure1 = 'NULL'

        if measure2 is not None:
            measure2 = '"' + measure2 + '"'
        else:
            measure2 = 'NULL'

        if outputSummaryFile is not None:
            tempSummaryFile = '"' + os.path.join(tempDir.Path, u'summary.txt').replace('\\', '\\\\') + '"'
        else:
            tempSummaryFile = 'NULL'

        (predictedResponse, newCutoff) = r('PredictModelForDataframe(model, %s, %s, %s, ignoreOutOfRangeValues=%s, cutoff=%s, outputPlotFile=%s, measure1=%s, measure2=%s, colorize=%s, outputSummaryFile=%s, res=%f, width=%f, height=%f, pointSize=%f, bg="%s")' % (rPackage, trainingData, newData, str(ignoreOutOfRangeValues).upper(), cutoff, tempPlotFile, measure1, measure2, str(colorize).upper(), tempSummaryFile, res, width, height, pointSize, bg))

        # If the caller provided a table and a field to receive the
        # predicted values, store them in the table. The cursor iterates in
        # the same OID order used to load the data frame above, so
        # predictedResponse[i] lines up with the i-th row.

        if inputTable is not None and predictedValuesField is not None:
            if table.HasOID:
                cursor = table.OpenUpdateCursor(where=where, orderBy=table.OIDFieldName + ' ASC', rowCount=len(predictedResponse))
            else:
                cursor = table.OpenUpdateCursor(where=where, rowCount=len(predictedResponse))
            try:
                Logger.Info(_(u'Writing predictions to field %(field)s for %(rowCount)i rows of %(table)s.') % {u'field': predictedValuesField, u'rowCount': len(predictedResponse), u'table': table.DisplayName})
                i = 0
                while cursor.NextRow():
                    cursor.SetValue(predictedValuesField, predictedResponse[i])
                    cursor.UpdateRow()
                    i += 1
            finally:
                del cursor

        # Move the temporary output files to the requested location.
        # tempPlotFile/tempSummaryFile are R string literals; [1:-1] strips
        # the surrounding quotes to recover the file-system path.

        if outputPlotFile is not None and os.path.isfile(tempPlotFile[1:-1]):
            File.MoveSilent(tempPlotFile[1:-1], outputPlotFile, overwriteExisting=overwriteExisting)

        if outputSummaryFile is not None and os.path.isfile(tempSummaryFile[1:-1]):
            File.MoveSilent(tempSummaryFile[1:-1], outputSummaryFile, overwriteExisting=overwriteExisting)

    # Delete R variables assigned by this function, so repeated invocations
    # do not leak state into the shared R interpreter.

    finally:
        r('if (exists("rastersForPredictors")) rm("rastersForPredictors")')
        r('if (exists("model")) rm("model")')
        r('if (exists("trainingData")) rm("trainingData")')
        r('if (exists("rPackage")) rm("rPackage")')
        r('if (exists("xVar")) rm("xVar")')
        r('if (exists("yVar")) rm("yVar")')
        r('if (exists("zVar")) rm("zVar")')
        r('if (exists("mVar")) rm("mVar")')
        r('if (exists("coordinateSystem")) rm("coordinateSystem")')
        r('if (exists("f20985982305")) rm("f20985982305", pos=globalenv())')
        r('if (exists("data20985982305")) rm("data20985982305", pos=globalenv())')
        r('if (exists("fam20985982305")) rm("fam20985982305", pos=globalenv())')
        if dataFrameName is not None:
            r('if (exists("%s")) rm("%s")' % (dataFrameName, dataFrameName))

    # Return successfully.

    return inputTable, newCutoff
def _PredictFromArcGISRasters(modelType, fitToolName, inputModelFile, outputResponseRaster, cutoff, constantPredictorNames, constantPredictorValues, rasterPredictorNames, predictorRasters, templateRaster, resamplingTechniques, ignoreOutOfRangeValues, outputErrorRaster, buildPyramids, overwriteExisting):
    """Predict a previously-fitted R model across ArcGIS rasters.

    Loads the fitted model from inputModelFile (an R workspace saved by one
    of the Fit tools), prepares the predictor rasters so they conform to the
    template raster, invokes the R function PredictModelForArcGISRasters to
    produce GDAL .bil files, and imports those files as the output response
    raster and (optionally) the output standard error raster.
    """

    # Perform additional parameter validation.

    if constantPredictorNames is None and constantPredictorValues is not None or constantPredictorNames is not None and constantPredictorValues is None:
        Logger.RaiseException(ValueError(_(u'The lists of constant predictor variables names and values must both be specified, or neither must be specified.')))

    if rasterPredictorNames is None and predictorRasters is not None or rasterPredictorNames is not None and predictorRasters is None:
        Logger.RaiseException(ValueError(_(u'The lists of raster predictor variable names and rasters must both be specified, or neither must be specified.')))

    if constantPredictorNames is not None and len(set(constantPredictorNames)) != len(constantPredictorNames):
        Logger.RaiseException(ValueError(_(u'The list of constant predictors must not contain duplicates.')))

    if rasterPredictorNames is not None and len(set(rasterPredictorNames)) != len(rasterPredictorNames):
        Logger.RaiseException(ValueError(_(u'The list of raster predictors must not contain duplicates.')))

    if constantPredictorNames is not None and rasterPredictorNames is not None and len(set(constantPredictorNames).intersection(set(rasterPredictorNames))) > 0:
        Logger.RaiseException(ValueError(_(u'The same predictor must not appear in both the list of constant predictors and the list of raster predictors.')))

    gp = GeoprocessorManager.GetWrappedGeoprocessor()

    if predictorRasters is not None:

        # Every predictor raster must have a defined coordinate system so it
        # can be reprojected to the template, and the outputs must not
        # overwrite any input.

        for raster in predictorRasters:
            d = gp.Describe(raster)
            if d.SpatialReference.Name is None or d.SpatialReference.Name.lower() == u'unknown':
                Logger.RaiseException(ValueError(_(u'The predictor raster %(raster)s does not have a coordinate system defined. Please define the coordinate system for this raster and try again. (Use the ArcGIS Define Projection tool.)') % {u'raster': raster}))

        if outputResponseRaster.lower() in map(lambda s: unicode(s).lower(), predictorRasters):
            Logger.RaiseException(ValueError(_(u'The output response raster %(out)s also appears in the list of input predictor rasters. This is not allowed. Please specify a different output response raster or remove it from the list of input predictor rasters.') % {u'out': outputResponseRaster}))

        # BUG FIX: this message previously substituted outputResponseRaster,
        # so the reported raster name was wrong when the standard error
        # raster collided with a predictor raster.

        if outputErrorRaster is not None and outputErrorRaster.lower() in map(lambda s: unicode(s).lower(), predictorRasters):
            Logger.RaiseException(ValueError(_(u'The output standard error raster %(out)s also appears in the list of input predictor rasters. This is not allowed. Please specify a different output standard error raster or remove it from the list of input predictor rasters.') % {u'out': outputErrorRaster}))

    # Look up the coordinate system, cell size, and extent of the
    # template raster. If no template was given, the first predictor raster
    # serves as the template.

    if templateRaster is None:
        if predictorRasters is None:
            Logger.RaiseException(ValueError(_(u'If no predictor rasters are provided, a template raster must be specified.')))
        templateRaster = predictorRasters[0]

    templateDescribe = gp.Describe(templateRaster)
    if templateDescribe.SpatialReference.Name is None or templateDescribe.SpatialReference.Name.lower() == u'unknown':
        Logger.RaiseException(ValueError(_(u'The template raster %(raster)s does not have a coordinate system defined. Please define the coordinate system for this raster and try again. (Use the ArcGIS Define Projection tool.)') % {u'raster': templateRaster}))

    coordinateSystem = gp.CreateSpatialReference(templateDescribe.SpatialReference).split(';')[0]
    cellSize = templateDescribe.MeanCellWidth
    extent = templateDescribe.Extent

    # Load the fitted model. Backslashes must be doubled because the path is
    # embedded in an R string literal.

    r = R.GetInterpreter()
    r('load("%s")' % inputModelFile.replace('\\', '\\\\'))
    try:
        # Validate that the loaded workspace actually contains a model of
        # the expected type.
        #
        # BUG FIX: the first message below uses %(type)s, but the
        # substitution dictionary previously omitted the 'type' key, causing
        # a KeyError whenever this error path was taken. The key is now
        # supplied.

        if not r('exists("model")'):
            Logger.RaiseException(ValueError(_('The input model file %(file)s does not contain a variable called "model", indicating that it was not generated by the Fit %(type)s tool. Please provide a file that was generated by the %(fitToolName)s tool.') % {u'file': inputModelFile, u'type': modelType, u'fitToolName': fitToolName}))

        if modelType.lower() not in r('tolower(class(model))'):
            Logger.RaiseException(ValueError(_('The input model file %(file)s contains a variable called "model" but it is not a %(type)s, indicating that it was not generated by the %(fitToolName)s tool. Please provide a file that was generated by the %(fitToolName)s tool.') % {u'file': inputModelFile, u'type': modelType, u'fitToolName': fitToolName}))

        if r('exists("rPackage")'):
            Logger.Info(_(u'Loaded %(type)s from %(file)s. The model was fitted with the R %(pkg)s package.') % {u'type': r('class(model)[1]'), u'file': inputModelFile, u'pkg': r['rPackage']})
        else:
            Logger.Info(_(u'Loaded %(type)s from %(file)s.') % {u'type': r('class(model)[1]'), u'file': inputModelFile, })

        # If the model required an R package, load it.

        if r('exists("rPackage")'):
            dependency = RPackageDependency(unicode(r['rPackage']))
            dependency.Initialize()

        # If it is a GAM fitted by the R gam package, report an error
        # if the caller specified an outputErrorRaster, because that
        # package's predict.gam function cannot generate predictions
        # from the newdata parameter.

        if outputErrorRaster is not None and modelType.lower() == 'gam' and r['rPackage'] == 'gam':
            Logger.RaiseException(ValueError(_('The input model from file %(file)s is a GAM that was fitted using the R gam package. Because the gam package cannot return standard errors for responses predicted from data other than those used to fit the model, a standard error raster cannot be produced. Please do not specify a standard error raster, or, if you must have one, re-fit the GAM using the R mgcv package.') % {u'file': inputModelFile}))

        # If it is a GAM fitted by the R gam package, we have to work
        # around a bug in predict.gam. The situation is described in
        # more detail in a message to the R help mailing list titled
        # "use of step.gam (from package 'gam') and superassignment
        # inside functions" dated 28-Feb-2008 (see
        # https://stat.ethz.ch/pipermail/r-help/2008-February/155586.html).
        # Define uniquely-named global variables for the formula, data
        # and family parameters. This is very hacky but it is the only
        # way I could get it working. Note the use of the <<-
        # operator. Hopefully these names are unique.

        if modelType.lower() == 'gam' and r['rPackage'] == 'gam':
            r('f20985982305 <<- model$formula')
            r('data20985982305 <<- model$data')
            r('fam20985982305 <<- model$family')

        # If the caller provided a GLM or GAM without any
        # predictor variables, then the predicted response and
        # standard errors will be constant values. Report a warning
        # and create constant rasters having these values.

        if (not r('exists("rPackage")') or r['rPackage'] in ['glm', 'gam', 'mgcv']) and isinstance(r('all.vars(model$formula)'), basestring):       # If only one term is in the model (i.e. the response variable), rpy will return it as a string, rather than a list with one string in it.
            if outputErrorRaster is not None:
                Logger.Warning(_(u'The model\'s formula does not contain any predictor variables. Because of this, the output response and standard error rasters will contain constant values.'))
            else:
                Logger.Warning(_(u'The model\'s formula does not contain any predictor variables. Because of this, the output response raster will contain a constant value.'))

            # A "random" raster with a degenerate UNIFORM min == max range
            # is the simplest way to create a constant-valued raster.

            responseValue = r('predict(model, newdata=data.frame(0), type="response")')
            gp.CreateRandomRaster_management(os.path.dirname(outputResponseRaster), os.path.basename(outputResponseRaster), u'UNIFORM %g %g' % (responseValue, responseValue), extent, cellSize)
            gp.DefineProjection_management(outputResponseRaster, coordinateSystem)
            if buildPyramids:
                gp.BuildPyramids_Management(outputResponseRaster)

            if outputErrorRaster is not None:
                errorValue = r('predict(model, newdata=data.frame(0), type="response", se.fit=TRUE)$se.fit')
                gp.CreateRandomRaster_management(os.path.dirname(outputErrorRaster), os.path.basename(outputErrorRaster), u'UNIFORM %g %g' % (errorValue, errorValue), extent, cellSize)
                gp.DefineProjection_management(outputErrorRaster, coordinateSystem)
                if buildPyramids:
                    # BUG FIX: this previously rebuilt pyramids on
                    # outputResponseRaster, leaving the error raster without
                    # pyramids.
                    gp.BuildPyramids_Management(outputErrorRaster)

        # Otherwise, the model contains at least one predictor
        # variable, do the prediction.

        else:
            tempDir = TemporaryDirectory()

            # First prepare the input rasters for the prediction.

            _PreparePredictorRasters(r, gp, tempDir, constantPredictorNames, rasterPredictorNames, predictorRasters, templateRaster, templateDescribe, coordinateSystem, cellSize, extent, resamplingTechniques)

            # Do the prediction. This creates files in GDAL-compatible
            # .bil format.
            #
            # Force the logging system to log R Info messages as
            # Debug so that the user is not spammed with "Closing
            # GDAL dataset handle" messages. There appears to be
            # no way to suppress these messages from within R.

            Logger.Info(_(u'Predicting...'))

            R.EvaluateFile(os.path.join(os.path.dirname(sys.modules[GLM.__module__].__file__), 'Utils.r'), False)
            R.EvaluateFile(os.path.join(os.path.dirname(sys.modules[GLM.__module__].__file__), 'PredictModel.r'), False)

            oldLoggingLevel = r.LogInfoAsDebug
            r.LogInfoAsDebug = True
            try:
                # randomForest and party models carry their training data in
                # a separate R variable; other model types embed it in
                # model$data.

                if r('exists("rPackage")') and r['rPackage'] in ['randomForest', 'party']:
                    trainingData = 'trainingData'
                else:
                    trainingData = 'model$data'

                if r('exists("rPackage")'):
                    rPackage = '"' + r['rPackage'] + '"'
                else:
                    rPackage = 'NULL'

                # Coerce constant predictor values to int or float when
                # possible (they may arrive as strings from the tool UI);
                # otherwise leave them as-is.

                if constantPredictorNames is not None and constantPredictorValues is not None:
                    for i in range(len(constantPredictorValues)):
                        try:
                            constantPredictorValues[i] = int(constantPredictorValues[i])
                        except:
                            try:
                                constantPredictorValues[i] = float(constantPredictorValues[i])
                            except:
                                pass
                    r['constantsForPredictors'] = dict(zip(constantPredictorNames, constantPredictorValues))
                else:
                    r('constantsForPredictors <- NULL')

                if cutoff is not None:
                    cutoff = repr(cutoff)
                else:
                    cutoff = 'NULL'

                tempResponseFile = os.path.join(tempDir.Path, u'temp_response')

                if outputErrorRaster is None:
                    r('PredictModelForArcGISRasters(model, %s, %s, rastersForPredictors, constantsForPredictors, "%s", ignoreOutOfRangeValues=%s, cutoff=%s)' % (trainingData, rPackage, tempResponseFile.replace('\\', '\\\\'), str(ignoreOutOfRangeValues).upper(), cutoff))
                else:
                    tempErrorFile = os.path.join(tempDir.Path, u'temp_standarderror')
                    r('PredictModelForArcGISRasters(model, %s, %s, rastersForPredictors, constantsForPredictors, "%s", "%s", ignoreOutOfRangeValues=%s, cutoff=%s)' % (trainingData, rPackage, tempResponseFile.replace('\\', '\\\\'), tempErrorFile.replace('\\', '\\\\'), str(ignoreOutOfRangeValues).upper(), cutoff))
            finally:
                r.LogInfoAsDebug = oldLoggingLevel

            # Copy the .bil files to the destination rasters.

            Logger.Info(_(u'Creating outputs...'))

            from GeoEco.Datasets import Dataset
            from GeoEco.Datasets.GDAL import GDALDataset
            from GeoEco.Datasets.ArcGIS import ArcGISWorkspace, ArcGISRaster

            bilFile = GDALDataset(tempResponseFile + '.bil', lazyPropertyValues={'SpatialReference': Dataset.ConvertSpatialReference('arcgis', coordinateSystem, 'obj')})
            workspace = ArcGISWorkspace(os.path.dirname(outputResponseRaster), ArcGISRaster, pathCreationExpressions=[os.path.basename(outputResponseRaster)])
            try:
                workspace.ImportDatasets(bilFile.QueryDatasets(reportProgress=False), {False: 'Add', True: 'Replace'}[overwriteExisting], reportProgress=False, calculateStatistics=True, buildRAT=True, buildPyramids=buildPyramids)
            finally:
                del bilFile
                del workspace

            if outputErrorRaster is not None:
                bilFile = GDALDataset(tempErrorFile + '.bil', lazyPropertyValues={'SpatialReference': Dataset.ConvertSpatialReference('arcgis', coordinateSystem, 'obj')})
                workspace = ArcGISWorkspace(os.path.dirname(outputErrorRaster), ArcGISRaster, pathCreationExpressions=[os.path.basename(outputErrorRaster)])
                try:
                    workspace.ImportDatasets(bilFile.QueryDatasets(reportProgress=False), {False: 'Add', True: 'Replace'}[overwriteExisting], reportProgress=False, calculateStatistics=True, buildPyramids=buildPyramids)
                finally:
                    del bilFile
                    del workspace

    # Delete R variables assigned by this function, so repeated invocations
    # do not leak state into the shared R interpreter.

    finally:
        r('if (exists("rastersForPredictors")) rm("rastersForPredictors")')
        r('if (exists("model")) rm("model")')
        r('if (exists("trainingData")) rm("trainingData")')
        r('if (exists("rPackage")) rm("rPackage")')
        r('if (exists("xVar")) rm("xVar")')
        r('if (exists("yVar")) rm("yVar")')
        r('if (exists("zVar")) rm("zVar")')
        r('if (exists("mVar")) rm("mVar")')
        r('if (exists("coordinateSystem")) rm("coordinateSystem")')
        r('if (exists("f20985982305")) rm("f20985982305", pos=globalenv())')
        r('if (exists("data20985982305")) rm("data20985982305", pos=globalenv())')
        r('if (exists("fam20985982305")) rm("fam20985982305", pos=globalenv())')
def _PreparePredictorRasters(r, gp, tempDir, constantPredictorNames,
rasterPredictorNames, predictorRasters, templateRaster, templateDescribe,
coordinateSystem, cellSize, extent, resamplingTechniques):
# Build a dictionary that maps predictor variable names in the
# model to rasters provided by the caller.
rastersForPredictors = {}
createXRaster = False
createYRaster = False
if r('exists("rPackage")') and r['rPackage'] in ['rpart', 'randomForest']:
allVars = r('all.vars(model$terms)')[1:]
if isinstance(allVars, basestring):
allVars = [allVars]
elif r('exists("rPackage")') and r['rPackage'] == 'nlme':
fixed = r('all.vars(fixed)')[1:]
if isinstance(fixed, basestring):
fixed = [fixed]
random = r('all.vars(random)')
if isinstance(random, basestring):
random = [random]
allVars = list(set(fixed).union(set(random)))
elif r('exists("rPackage")') and r['rPackage'] == 'party':
allVars =
r('all.vars(model@data@formula$input)')
if isinstance(allVars, basestring):
allVars = [allVars]
else:
allVars = r('all.vars(model$formula)')[1:]
if isinstance(allVars, basestring):
allVars = [allVars]
for variable in allVars:
if variable not in rasterPredictorNames and (constantPredictorNames
is None or variable not in constantPredictorNames):
if variable == r['xVar']:
createXRaster = True
elif variable == r['yVar']:
createYRaster = True
else:
Logger.RaiseException(ValueError(_('The predictor variable
%(var)s appears in the model\'s formula but does not appear in the list of
model variable names for predictor rasters provided to this tool. Add that
predictor variable to the list and try again.') % {u'var': variable}))
elif variable in rasterPredictorNames:
rastersForPredictors[variable] =
predictorRasters[rasterPredictorNames.index(variable)]
# Generate the X and Y coordinate rasters, if needed and possible.
if createXRaster:
if r['coordinateSystem'] != coordinateSystem:
Logger.RaiseException(ValueError(_('The model formula includes
the X coordinate as predictor variable %(var)s, but the list of predictor
rasters does not include this variable. Because the model was fitted to
points that used a different coordinate system than the one you want to use
for the predictions, this tool cannot automatically generate a raster for the
X coordinates. Please generate one yourself and add it to the list of
predictor rasters') % {u'var': r['xVar']}))
xRaster = os.path.join(tempDir.Path, u'xvar')
oldLogInfoAsDebug = Logger.LogInfoAndSetInfoToDebug(_(u'Creating a
predictor raster for the variable "%(var)s" representing the X
coordinate...') % {u'var': r['xVar']})
try:
ArcGISRaster.CreateXRaster(xRaster, extent, cellSize, u'Center',
coordinateSystem)
finally:
Logger.SetLogInfoAsDebug(oldLogInfoAsDebug)
rastersForPredictors[r['xVar']] = xRaster
if createYRaster:
if r['coordinateSystem'] != coordinateSystem:
Logger.RaiseException(ValueError(_('The model formula includes
the Y coordinate as predictor variable %(var)s, but the list of predictor
rasters does not include this variable. Because the model was fitted to
points that used a different coordinate system than the one you want to use
for the predictions, this tool cannot automatically generate a raster for the
Y coordinates. Please generate one yourself and add it to the list of
predictor rasters') % {u'var': r['yVar']}))
yRaster = os.path.join(tempDir.Path, u'yvar')
oldLogInfoAsDebug = Logger.LogInfoAndSetInfoToDebug(_(u'Creating a
predictor raster for the variable "%(var)s" representing the Y
coordinate...') % {u'var': r['yVar']})
try:
ArcGISRaster.CreateYRaster(yRaster, extent, cellSize, u'Center',
coordinateSystem)
finally:
Logger.SetLogInfoAsDebug(oldLogInfoAsDebug)
rastersForPredictors[r['yVar']] = yRaster
# Check each predictor raster to see if it needs to be copied,
# projected or clipped before we can use it.
Logger.Info(_('Checking coordinate systems, extents, and cell sizes of
predictor rasters and reprojecting and clipping as needed to make them
conform to the template raster...'))
from GeoEco.Types import EnvelopeTypeMetadata
[templateLeft, templateBottom, templateRight, templateTop] =
EnvelopeTypeMetadata.ParseFromArcGISString(extent)
vars = copy.deepcopy(rastersForPredictors.keys()) # I'm going
to modify the dictionary in the loop below; not sure if deepcopy is needed,
but playing it safe.
rasters = copy.deepcopy(rastersForPredictors.values())
for i in range(len(vars)):
needToCopy = False
needToProject = False
needToClip = False
# If this is predictor raster is a raster layer, we need to
# make a copy of it in the file system so that it can be read
# by the R code using the rgdal package.
d = gp.Describe(rasters[i])
if d.DataType.lower() == u'rasterlayer':
needToCopy = True
# If this is not the template raster, check to see whether it
# needs to be projected or clipped.
if rasters[i] != templateRaster:
cs =
gp.CreateSpatialReference_management(d.SpatialReference).split(u';')[0]
if cs.lower() != coordinateSystem.lower():
Logger.Info(_(u'Projecting and clipping %(raster)s. Its
coordinate system does not match that of the template raster.') % {u'raster':
rasters[i]})
needToProject = True
elif abs(1.0 - cellSize / d.MeanCellWidth) > 0.00001:
Logger.Info(_(u'Projecting and clipping %(raster)s. Its cell
size (%(cs1)s) does not match the cell size of the template raster
(%(cs2)s).') % {u'raster': rasters[i], u'cs1': repr(d.MeanCellWidth), u'cs2':
repr(cellSize)})
needToProject = True
else:
[rasterLeft, rasterBottom, rasterRight, rasterTop] =
EnvelopeTypeMetadata.ParseFromArcGISString(d.Extent)
deltaLeft = (templateLeft - rasterLeft) / cellSize #
Difference, in cells, between the left edge of the template and this raster
deltaBottom = (templateBottom - rasterBottom) / cellSize #
Difference, in cells, between the bottom edge of the template and this raster
deltaRight = (templateRight - rasterRight) / cellSize #
Difference, in cells, between the right edge of the template and this raster
deltaTop = (templateTop - rasterTop) / cellSize #
Difference, in cells, between the top edge of the template and this raster
if deltaLeft < -0.01:
Logger.RaiseException(ValueError(_(u'The template raster
is too large; its left edge falls (%(e1)s) outside the left edge of the
predictor raster %(raster)s (%(e2)s). Please reduce the extent of the
template raster such that it does not exceed the extents of any of the
predictor rasters.') % {u'raster': rasters[i], u'e1': repr(templateLeft),
u'e2': repr(rasterLeft)}))
if deltaRight > 0.01:
Logger.RaiseException(ValueError(_(u'The template raster
is too large; its right edge falls (%(e1)s) outside the right edge of the
predictor raster %(raster)s (%(e2)s). Please reduce the extent of the
template raster such that it does not exceed the extents of any of the
predictor rasters.') % {u'raster': rasters[i], u'e1': repr(templateRight),
u'e2': repr(rasterRight)}))
if deltaTop > 0.01:
Logger.RaiseException(ValueError(_(u'The template raster
is too large; its top edge falls (%(e1)s) outside the top edge of the
predictor raster %(raster)s (%(e2)s). Please reduce the extent of the
template raster such that it does not exceed the extents of any of the
predictor rasters.') % {u'raster': rasters[i], u'e1': repr(templateTop),
u'e2': repr(rasterTop)}))
if deltaBottom < -0.01:
Logger.RaiseException(ValueError(_(u'The template raster
is too large; its bottom edge falls (%(e1)s) outside the bottom edge of the
predictor raster %(raster)s (%(e2)s). Please reduce the extent of the
template raster such that it does not exceed the extents of any of the
predictor rasters.') % {u'raster': rasters[i], u'e1': repr(templateBottom),
u'e2': repr(rasterBottom)}))
if abs(deltaLeft) < 0.01 and abs(deltaBottom) < 0.01 and
abs(deltaRight) < 0.01 and abs(deltaTop) < 0.01 and d.Width ==
templateDescribe.Width and d.Height == templateDescribe.Height:
Logger.Debug(_(u'%(raster)s has the same coordinate
system, cell size, and extent as the template raster.') % {u'raster':
rasters[i]})
elif abs(deltaLeft - round(deltaLeft)) < 0.01 and
abs(deltaBottom - round(deltaBottom)) < 0.01 and abs(deltaRight -
round(deltaRight)) < 0.01 and abs(deltaTop - round(deltaTop)) < 0.01:
Logger.Info(_(u'Clipping %(raster)s. Although it has the
same coordinate system and cell size as the template raster and is aligned on
the same grid coordinates, it has a different extent.') % {u'raster':
rasters[i]})
needToClip = True
else:
Logger.Info(_(u'Projecting and clipping %(raster)s.
Although it has the same coordinate system and cell size as the template
raster, it is aligned on different grid coordinates and has a different
extent.') % {u'raster': rasters[i]})
needToProject = True
# If this raster does not need to be copied, projected, or
# clipped, go on to the next one.
if not (needToCopy or needToProject or needToClip):
continue
# If we need to project or clip, do it now using a function we
# developed specifically for this purpose.
if needToProject or needToClip:
# Use the caller's resampling technique, if one was
# provided. Otherwise use BILINEAR if it is a
# floating-point raster, or NEAREST if it is not.
if resamplingTechniques is not None and vars[i] in
rasterPredictorNames and rasterPredictorNames.index(vars[i]) <
len(resamplingTechniques):
resamplingTechnique =
resamplingTechniques[rasterPredictorNames.index(vars[i])]
elif d.PixelType[0] in ['F', 'f']:
resamplingTechnique = u'BILINEAR'
else:
resamplingTechnique = u'NEAREST'
# Project/clip it to the template.
tempRaster = os.path.join(tempDir.Path, u'projected%i' % i)
ArcGISRaster.ProjectToTemplate(rasters[i], templateRaster,
tempRaster, resamplingTechnique)
# If we got here but did not need to project or clip, we
# needed to copy. Create a copy of the raster in ArcInfo
# binary grid format.
else:
tempRaster = os.path.join(tempDir.Path, u'copied%i' % i)
ArcGISRaster.CopySilent(rasters[i], tempRaster)
# If we had to project or clip, make absolutely sure that the
# projected or clipped raster has the expected number of rows
# and columns. This safety check is to protect the user from
# the difficulty in getting ArcGIS to clip a raster to the
# exact extent desired. The function we used above is supposed
# to handle it, but we check again here for safety.
if needToProject or needToClip:
d = gp.Describe(tempRaster)
if d.Height != templateDescribe.Height or d.Width !=
templateDescribe.Width:
Logger.RaiseException(RuntimeError(_(u'Internal error in this
tool. Please contact the MGET development team for assistance. Error details:
after %(raster)s was projected and/or clipped, the resulting raster had
%(cols1)i columns and %(rows1)i rows, while the template raster has %(cols2)i
columns and %(rows2)i rows. This tool expected that the resulting raster
would have the same number of rows and columns as the template raster. You
can work around this problem by projecting and/or clipping the raster
yourself, so it has the same coordinate system, cell size, and extent as the
template raster.') % {u'raster': rasters[i], u'rows1': d.Height, u'cols1':
d.Width, u'rows2': templateDescribe.Height, u'cols2':
templateDescribe.Width}))
# Update our dictionary to use the projected/clipped/copied
# raster.
rastersForPredictors[vars[i]] = tempRaster
# Pass the dictionary that maps predictor variables to rasters to
# R, so the R code can read it.
r['rastersForPredictors'] = rastersForPredictors
###############################################################################
# Metadata: module
###############################################################################
from GeoEco.ArcGIS import ArcGISDependency, ArcGISExtensionDependency
from GeoEco.DatabaseAccess.ArcGIS import ArcGIS91SelectCursor
from GeoEco.Dependencies import PythonModuleDependency
from GeoEco.R import RDependency, RPackageDependency
from GeoEco.Metadata import *
from GeoEco.Types import *
# Register this module with the GeoEco metadata system.
AddModuleMetadata(shortDescription=_(u'Provides methods for modeling and prediction.'))

###############################################################################
# Metadata: GLM class
###############################################################################

# Register the GLM class and expose it as a COM server so that its methods
# can be invoked from ArcGIS geoprocessing tools. The IID/CLSID GUIDs must
# remain stable across releases so existing COM registrations keep working.
AddClassMetadata(GLM,
    shortDescription=_(u'Provides methods for modeling and prediction using Generalized Linear Models (GLMs).'),
    isExposedAsCOMServer=True,
    comIID=u'{4A349150-2A25-4B35-851C-A9BFFADF53F3}',
    comCLSID=u'{16411D8A-C8CA-40C6-A9D3-F34EA8CD8D6A}')
# Public method: GLM.FitToArcGISTable

# Declare the "Fit GLM" tool: callable from Python, from COM, and as an
# ArcGIS toolbox tool. Requires ArcGIS 9.1 or later and R 2.6.0 or later.
AddMethodMetadata(GLM.FitToArcGISTable,
    shortDescription=_(u'Fits a generalized linear model (GLM) to data in a table using the R glm function.'),
    isExposedToPythonCallers=True,
    isExposedByCOM=True,
    isExposedAsArcGISTool=True,
    arcGISDisplayName=_(u'Fit GLM'),
    arcGISToolCategory=_(u'Statistics\\Model Data\\Generalized Linear Models'),
    dependencies=[ArcGISDependency(9, 1), RDependency(2, 6, 0)])

# The method may be called on the GLM class itself or on an instance of it.
AddArgumentMetadata(GLM.FitToArcGISTable, u'cls',
    typeMetadata=ClassOrClassInstanceTypeMetadata(cls=GLM),
    description=_(u'%s class or an instance of it.') % GLM.__name__)

AddArgumentMetadata(GLM.FitToArcGISTable, u'inputTable',
    typeMetadata=ArcGISTableViewTypeMetadata(mustExist=True),
    description=_(
u"""ArcGIS table, table view, feature class, or feature layer
containing the data to which the model should be fitted."""),
    arcGISDisplayName=_(u'Input table'))

# The output model file is deleted first when overwriteExisting is True, and
# any missing parent directories are created automatically.
AddArgumentMetadata(GLM.FitToArcGISTable, u'outputModelFile',
    typeMetadata=FileTypeMetadata(mustBeDifferentThanArguments=[u'inputTable'], deleteIfParameterIsTrue=u'overwriteExisting', createParentDirectories=True),
    description=_(
u"""Output file to receive the fitted model. The file will not be in a
user-readable format. After the model is fitted, you can provide the
file to other tools that perform further analysis or visualization of
the fitted model.
It is suggested, but not required, that you give the file an .Rdata
extension."""),
    direction=u'Output',
    arcGISDisplayName=_(u'Output model file'))
# Shared description of the R model formula syntax. It is appended to the
# 'formula' argument description below so the same text can be reused by
# related tools. (Fixed typo: "It can also used" -> "It can also be used".)
_FormulaDescription = _(
u"""The formula must be in the format expected by the R glm function::
response ~ term1 + term2 + ... + termN
response is the table field that will be modeled as the response
variable and the terms are the table fields that will serve as the
predictor variables. The field names are case sensitive. If any field
used in the formula is NULL for a given row, that row will not be used
in fitting the model.
For example, if you have a field Presence that indicates the presence
or absence of a species (1 or 0) and you want to model it in terms of
sampled environmental covariates stored in the SST, ChlDensity, and
Depth fields, you would use the formula::
Presence ~ SST + ChlDensity + Depth
By default, all terms are treated as continuous variables. To indicate
that a term should be treated as a categorical variable, use the
factor function. For example, if SubstrateType is an integer code that
should be treated as categorical::
Presence ~ SST + ChlDensity + Depth + factor(SubstrateType)
The model terms may also use these operators:
* The : operator denotes the interaction of variables a and b. For
example: a:b.
* The * operator denotes "crossing". For example, a*b is identical to
a+b+a:b.
* The ^ operator denotes crossing to the Nth degree. For example,
(a+b+c)^2 is identical to (a+b+c)*(a+b+c) which in turn expands to a
formula containing the main effects for a, b and c together with
their second-order interactions.
* The %in% operator indicates that the terms on its left are nested
within those on the right. For example a + b %in% a expands to the
formula a + a:b.
* The - operator (minus) removes the specified terms, so that
(a+b+c)^2 - a:b is identical to a + b + c + b:c + a:c. It can also be
used to remove the intercept term: y ~ x - 1 is a line through the
origin. A model with no intercept can be also specified as y ~ x + 0
or y ~ 0 + x.
While formulae usually involve just variable names, they can also
involve arithmetic expressions. The formula log(y) ~ a + log(x) is
quite legal. When such arithmetic expressions involve operators which
are also used symbolically in model formulae, there can be confusion
between arithmetic and symbolic operator use.
To avoid this confusion, the function I() can be used to bracket those
portions of a model formula where the operators are used in their
arithmetic sense. For example, in the formula y ~ a + I(b+c), the term
b+c is to be interpreted as the sum of b and c.
Please see the topics "glm" and "formula" in the R documentation for
more information.""")

AddArgumentMetadata(GLM.FitToArcGISTable, u'formula',
    typeMetadata=UnicodeStringTypeMetadata(),
    description=_(
u"""Formula that specifies the table field that is the response
variable and the table fields that are the terms of the model.
""" + _FormulaDescription),
    arcGISDisplayName=_(u'Formula'))
# Model-options arguments: family, where clause, link function, and the
# variance function used with the quasi family. The allowed values mirror
# those accepted by the R glm/family functions.
AddArgumentMetadata(GLM.FitToArcGISTable, u'family',
    typeMetadata=UnicodeStringTypeMetadata(allowedValues=[u'binomial', u'gaussian', u'Gamma', u'inverse.gaussian', u'poisson', u'quasi', u'quasibinomial', u'quasipoisson']),
    description=_(
u"""Name of the family for the model. The family, together with the
link function and variance parameters, describes the error
distribution that will be used by the model. The family name is case
sensitive."""),
    arcGISDisplayName=_(u'Family'))

# Reuse the select cursor's documentation for the where clause so all tools
# document it identically.
AddArgumentMetadata(GLM.FitToArcGISTable, u'where',
    typeMetadata=SQLWhereClauseTypeMetadata(canBeNone=True),
    description=ArcGIS91SelectCursor.__init__.__doc__.Obj.GetArgumentByName(u'where').Description,
    arcGISParameterDependencies=[u'inputTable'],
    arcGISDisplayName=_(u'Where clause'),
    arcGISCategory=_(u'Model options'))

AddArgumentMetadata(GLM.FitToArcGISTable, u'link',
    typeMetadata=UnicodeStringTypeMetadata(allowedValues=[u'1/mu^2', u'cauchit', u'cloglog', u'identity', u'inverse', u'log', u'logit', u'probit', u'sqrt'], canBeNone=True),
    description=_(
u"""Name of the link function for the model. The allowed link
functions depend on the model family:
* binomial family - cauchit, cloglog, log, logit, and probit
* gaussian family - identity, inverse, and log
* Gamma family - identity, inverse, and log
* inverse.gaussian family - 1/mu^2, inverse, identity, and log
* poisson family - identity, log, and sqrt
* quasi family - 1/mu^2, cloglog, identity, inverse, log, logit, probit, and
sqrt
* quasibinomial family - cauchit, cloglog, log, logit, and probit
* quasipoisson family - identity, log, and sqrt
If a link function is not specified, the one that is used by default
also depends on the model family:
* binomial family - logit
* gaussian family - identity
* Gamma family - inverse
* inverse.gaussian family - 1/mu^2
* poisson family - log
* quasi family - identity
* quasibinomial family - logit
* quasipoisson family - log
"""),
    arcGISDisplayName=_(u'Link function'),
    arcGISCategory=_(u'Model options'))

AddArgumentMetadata(GLM.FitToArcGISTable, u'variance',
    typeMetadata=UnicodeStringTypeMetadata(allowedValues=[u'constant', u'mu(1-mu)', u'mu', u'mu^2', u'mu^3'], canBeNone=True),
    description=_(
u"""Variance function to use when the "quasi" family is used. For all
families other than quasi, this parameter is ignored because the
variance function is determined by the family. If the quasi family is
used but a variance function is not specified, the "constant" variance
function will be used by default."""),
    arcGISDisplayName=_(u'Variance function for quasi family'),
    arcGISCategory=_(u'Model options'))
# Point-feature options: names under which the X, Y, Z, and M values of
# point features are made available to the model formula. Each name must
# differ from the ones declared before it.
AddArgumentMetadata(GLM.FitToArcGISTable, u'xColumnName',
    typeMetadata=ArcGISFieldTypeMetadata(canBeNone=True),
    description=_(
u"""Name to use in the formula for the X coordinates of point
features. If the input table is a point feature class or layer, the X
coordinates will be extracted from the points and be accessible in the
formula using the name provided for this parameter."""),
    arcGISDisplayName=_(u'Name to use for X coordinates of points'),
    arcGISCategory=_(u'Point feature options'))

AddArgumentMetadata(GLM.FitToArcGISTable, u'yColumnName',
    typeMetadata=ArcGISFieldTypeMetadata(canBeNone=True, mustBeDifferentThanArguments=[u'xColumnName']),
    description=_(
u"""Name to use in the formula for the Y coordinates of point
features. If the input table is a point feature class or layer, the Y
coordinates will be extracted from the points and be accessible in the
formula using the name provided for this parameter."""),
    arcGISDisplayName=_(u'Name to use for Y coordinates of points'),
    arcGISCategory=_(u'Point feature options'))

AddArgumentMetadata(GLM.FitToArcGISTable, u'zColumnName',
    typeMetadata=ArcGISFieldTypeMetadata(canBeNone=True, mustBeDifferentThanArguments=[u'xColumnName', u'yColumnName']),
    description=_(
u"""Name to use in the formula for the Z coordinates of point
features. If the input table is a point feature class or layer that
has Z coordinates, the Z coordinates will be extracted from the points
and be accessible in the formula using the name provided for this
parameter."""),
    arcGISDisplayName=_(u'Name to use for Z coordinates of points'),
    arcGISCategory=_(u'Point feature options'))

AddArgumentMetadata(GLM.FitToArcGISTable, u'mColumnName',
    typeMetadata=ArcGISFieldTypeMetadata(canBeNone=True, mustBeDifferentThanArguments=[u'xColumnName', u'yColumnName', u'zColumnName']),
    description=_(
u"""Name to use in the formula for the measure values of point
features. If the input table is a point feature class or layer that
has measure values, the measure values will be extracted from the
points and be accessible in the formula using the name provided for this
parameter."""),
    arcGISDisplayName=_(u'Name to use for M values of points'),
    arcGISCategory=_(u'Point feature options'))
# Automated model selection options. (Fixed typo in the help text:
# "N candidate model are generated" -> "N candidate models are generated".)
AddArgumentMetadata(GLM.FitToArcGISTable, u'selectionMethod',
    typeMetadata=UnicodeStringTypeMetadata(canBeNone=True, allowedValues=[u'Stepwise backward'], makeLowercase=True),
    description=_(
u"""Automated model selection method to execute after fitting the
model using the original formula you specified. If automated model
selection is performed, the output model file will contain the final
model that was selected, and the output summary and plots will be for
this model.
The selection methods currently available are:
* Stepwise backward - backward stepwise model selection by AIC.
The basic idea of this method is to keep dropping model terms from
the original formula so long as a better model fit can be achieved.
Model selection occurs in a loop. First, the Akaike Information
Criterion (AIC) is computed for the current model. Then, N candidate
models are generated, where N equals the number of terms in the
current model. Each candidate model drops a different single term
from the model. The AIC is computed for each candidate model, and
the candidate model that provides the greatest reduction in AIC
becomes the new current model. The loop proceeds until the AIC can
no longer be reduced by dropping terms.
Additional methods may be implemented in a future release of this
tool.
Automated model selection will increase the run time and possibly the
memory utilization of this tool. The amount of increase depends on the
complexity of your model and the amount of data in your table.
Model selection can be a difficult task. You should never blindly rely
on an automated selection method that you do not fully understand. For
the best results, always consult a statistician."""),
    arcGISDisplayName=_(u'Automated model selection method'),
    arcGISCategory=_(u'Automated model selection options'))

AddArgumentMetadata(GLM.FitToArcGISTable, u'logSelectionDetails',
    typeMetadata=BooleanTypeMetadata(),
    description=_(
u"""If True, detailed information will be logged as automated model
selection proceeds. You can use this to monitor the progress of
automated model selection and diagnose how the selection method
arrived at the final model.
If False, no detailed information will be logged during automated
model selection."""),
    arcGISDisplayName=_(u'Log model selection details'),
    arcGISCategory=_(u'Automated model selection options'))
# Additional output options: model summary file, diagnostic plots, and
# per-term partial plots. (Fixed typo in the diagLabelField help text:
# "Table field to for labeling" -> "Table field to use for labeling".)
AddArgumentMetadata(GLM.FitToArcGISTable, u'writeSummaryFile',
    typeMetadata=BooleanTypeMetadata(),
    description=_(
u"""If True, this tool will write summary information about the fitted
model to a text file. (This is the same information that the tool
outputs as log messages.) If automated model selection is performed,
the output text file will contain summary information for both the
initial model and the final model.
The file will have the name X_summary.txt, where X is the name of the
output model file, minus any extension."""),
    arcGISDisplayName=_(u'Write model summary file'),
    arcGISCategory=_(u'Additional output options'))

AddArgumentMetadata(GLM.FitToArcGISTable, u'writeDiagnosticPlots',
    typeMetadata=BooleanTypeMetadata(),
    description=_(
u"""If True, this tool will write six diagnostic plots for the fitted
model. If automated model selection is performed, plots will be
generated only for the final model.
Each plot will be written to a PNG file:
* X_resid_fit.png - residuals vs. fitted values
* X_qq.png - normal Q-Q plot
* X_scale_loc.png - scale-location plot of sqrt(abs(residuals)) vs.
fitted values
* X_cooks.png - Cook's distances vs. row labels
* X_resid_lev.png - residuals vs. leverages
* X_cooks_lev.png - Cook's distances vs. leverage/(1-leverage)
In the file names above, X is the name of the output model file, minus
any extension. Please see the R documentation for the plot.lm function
for more information about the plots."""),
    arcGISDisplayName=_(u'Write diagnostic plots'),
    arcGISCategory=_(u'Additional output options'))

AddArgumentMetadata(GLM.FitToArcGISTable, u'numDiagLabels',
    typeMetadata=IntegerTypeMetadata(minValue=0),
    description=_(
u"""Number of extreme points to label in diagnostic plots.
This parameter has no effect when diagnostic plots are not
written."""),
    arcGISDisplayName=_(u'Number of extreme points to label in diagnostic plots'),
    arcGISCategory=_(u'Additional output options'))

AddArgumentMetadata(GLM.FitToArcGISTable, u'diagLabelField',
    typeMetadata=ArcGISFieldTypeMetadata(mustExist=True, canBeNone=True),
    description=_(
u"""Table field to use for labeling extreme points in diagnostic plots.
Usually, you choose a field that uniquely identifies the table row. If
you do not specify a field, R will assign labels for you, and it may
be difficult to determine which row a given point corresponds to.
This parameter has no effect when diagnostic plots are not
written."""),
    arcGISParameterDependencies=[u'inputTable'],
    arcGISDisplayName=_(u'Field for labeling extreme points in diagnostic plots'),
    arcGISCategory=_(u'Additional output options'))

AddArgumentMetadata(GLM.FitToArcGISTable, u'writeTermPlots',
    typeMetadata=BooleanTypeMetadata(),
    description=_(
u"""If True, this tool will write a partial plot for each term in the
fitted model's formula. If automated model selection is performed,
plots will be generated only for the final model.
Each plot will be written to a PNG file named X_termY.png, where X is
the name of the output model file, minus any extension, and Y is the
term number in the model (minimum of two digits). For example, if the
model had this formula::
Presence ~ SST + log(ChlDensity) + Depth
The output files would be X_term01.png, X_term02.png, and
X_term03.png, which correspond to the SST, log(ChlDensity), and Depth
terms respectively."""),
    arcGISDisplayName=_(u'Write plots of model terms'),
    arcGISCategory=_(u'Additional output options'))

AddArgumentMetadata(GLM.FitToArcGISTable, u'residuals',
    typeMetadata=BooleanTypeMetadata(),
    description=_(
u"""If True, residuals will be plotted in model term plots of
continuous terms (i.e. terms that are not factors).
Residuals are calculated using the method employed by the mgcv
package. That is, they are "the working residuals from the IRLS
iteration weighted by the IRLS weights. [These] are the residuals that
would be obtained by dropping the term concerned from the model, while
leaving all other estimates fixed (i.e. the estimates for the term
plus the residuals)." """),
    arcGISDisplayName=_(u'Plot residuals'),
    arcGISCategory=_(u'Additional output options'))
# Plot appearance and file-format options, plus the standard overwrite flag.
# (Fixed two help-text defects: "plot format file is EMF" -> "plot file
# format is EMF", and "the output files will be overwritten, if it exists"
# -> "if they exist".)
AddArgumentMetadata(GLM.FitToArcGISTable, u'xAxis',
    typeMetadata=BooleanTypeMetadata(),
    description=_(
u"""If True, a straight line will be drawn at y=0 in model term plots
of continuous terms (i.e. terms that are not factors)."""),
    arcGISDisplayName=_(u'Plot x-axis'),
    arcGISCategory=_(u'Additional output options'))

AddArgumentMetadata(GLM.FitToArcGISTable, u'commonScale',
    typeMetadata=BooleanTypeMetadata(),
    description=_(
u"""If True, model term plots of continuous terms (i.e. terms that are
not factors) will have the same y-axis scale, allowing the relative
importance of different terms to be ascertained by comparing the plots
side by side."""),
    arcGISDisplayName=_(u'Use common y-axis scale'),
    arcGISCategory=_(u'Additional output options'))

AddArgumentMetadata(GLM.FitToArcGISTable, u'plotFileFormat',
    typeMetadata=UnicodeStringTypeMetadata(allowedValues=[u'emf', u'png'], makeLowercase=True),
    description=_(
u"""Plot file format, one of:
* emf - Windows enhanced metafile (EMF) format. This is a vector
format that may be printed and resized without any pixelation and is
therefore suitable for use in printable documents that recognize
this format (e.g. Microsoft Word or Microsoft Visio).
* png - Portable network graphics (PNG) format. This is a compressed,
lossless, highly portable raster format suitable for use in web
pages or other locations where a raster format is desired. Most
scientific journals accept PNG; they typically request that files
have a resolution of at least 1000 DPI.
"""),
    arcGISDisplayName=_(u'Plot file format'),
    arcGISCategory=_(u'Additional output options'))

AddArgumentMetadata(GLM.FitToArcGISTable, u'res',
    typeMetadata=FloatTypeMetadata(mustBeGreaterThan=0.),
    description=_(
u"""PNG plot file resolution, in dots per inch (DPI). The default is
set to a high value (1000) because this is the minimum resolution
typically required by scientific journals that accept figures in PNG
format.
This parameter is ignored for EMF format because it is a vector
format."""),
    arcGISDisplayName=_(u'Plot resolution, in DPI'),
    arcGISCategory=_(u'Additional output options'))

AddArgumentMetadata(GLM.FitToArcGISTable, u'width',
    typeMetadata=FloatTypeMetadata(mustBeGreaterThan=0.),
    description=_(
u"""Plot file width in thousandths of inches (for EMF format; e.g. the
value 3000 is 3 inches) or pixels (for PNG format)."""),
    arcGISDisplayName=_(u'Plot width'),
    arcGISCategory=_(u'Additional output options'))

AddArgumentMetadata(GLM.FitToArcGISTable, u'height',
    typeMetadata=FloatTypeMetadata(mustBeGreaterThan=0.),
    description=_(
u"""Plot file height in thousandths of inches (for EMF format; e.g. the
value 3000 is 3 inches) or pixels (for PNG format)."""),
    arcGISDisplayName=_(u'Plot height'),
    arcGISCategory=_(u'Additional output options'))

AddArgumentMetadata(GLM.FitToArcGISTable, u'pointSize',
    typeMetadata=FloatTypeMetadata(minValue=1.0),
    description=_(
u"""The default pointsize of plotted text."""),
    arcGISDisplayName=_(u'Default pointsize of plotted text'),
    arcGISCategory=_(u'Additional output options'))

AddArgumentMetadata(GLM.FitToArcGISTable, u'bg',
    typeMetadata=UnicodeStringTypeMetadata(),
    description=_(
u"""PNG plot file background color. The color must be a valid name in
R's color palette, or "transparent" if there is no background color.
This parameter is ignored if the plot file format is EMF."""),
    arcGISDisplayName=_(u'Plot background color'),
    arcGISCategory=_(u'Additional output options'))

# This flag is initialized from the geoprocessor's OverwriteOutput setting
# so the tool honors the user's ArcGIS environment preference.
AddArgumentMetadata(GLM.FitToArcGISTable, u'overwriteExisting',
    typeMetadata=BooleanTypeMetadata(),
    description=_(
u"""If True, the output files will be overwritten, if they exist. If
False, a ValueError will be raised if an output file exists."""),
    initializeToArcGISGeoprocessorVariable=u'OverwriteOutput')
# Public method: GLM.PredictFromArcGISTable

# Declare the "Predict GLM From Table" tool. Requires ArcGIS 9.2+, R 2.6.0+,
# and the ROCR, e1071, and caret R packages. (Fixed typos in the help text:
# "recieve" -> "receive" and "so you perform" -> "so you can perform".)
AddMethodMetadata(GLM.PredictFromArcGISTable,
    shortDescription=_(u'Given a fitted generalized linear model (GLM), this tool predicts the response variable for each row of a table.'),
    longDescription=_(
u"""If a table is not provided, the prediction will be done on the
training data used to fit the model.
On completion, the tool outputs statistics that summarize how well the
model's predictions match the observed values of the response
variable, unless a table is provided that does not contain the
observed values of the response variable."""),
    isExposedToPythonCallers=True,
    isExposedByCOM=True,
    isExposedAsArcGISTool=True,
    arcGISDisplayName=_(u'Predict GLM From Table'),
    arcGISToolCategory=_(u'Statistics\\Model Data\\Generalized Linear Models'),
    dependencies=[ArcGISDependency(9, 2), RDependency(2, 6, 0), RPackageDependency(u'ROCR'), RPackageDependency(u'e1071'), RPackageDependency(u'caret')]) # e1071 is required by caret

# The cls argument is identical to that of FitToArcGISTable.
CopyArgumentMetadata(GLM.FitToArcGISTable, u'cls', GLM.PredictFromArcGISTable, u'cls')

AddArgumentMetadata(GLM.PredictFromArcGISTable, u'inputModelFile',
    typeMetadata=FileTypeMetadata(mustExist=True),
    description=_(
u"""File that contains the fitted model, generated by the Fit GLM
tool."""),
    arcGISDisplayName=_(u'Input model file'))

AddArgumentMetadata(GLM.PredictFromArcGISTable, u'inputTable',
    typeMetadata=ArcGISTableViewTypeMetadata(canBeNone=True, mustExist=True),
    description=_(
u"""ArcGIS table, table view, feature class, or feature layer
containing the data for which the prediction should be done.
This parameter is optional. If not provided, the prediction will be
done on the training data used to fit the model.
If a table is provided, it must have a field for each predictor
variable in the model. These fields must have the same names as the
predictor variables.
The table must have one or both of the following:
* A field containing the actual (observed) values of the response
variable. This field must have the same name as the response
variable. If it exists, the tool will calculate model performance
statistics by comparing the actual values to the values predicted by
the model.
* A field that you designate to receive the predicted values.
This tool will report an error if both of these fields are
absent."""),
    arcGISDisplayName=_(u'Input table'))

AddArgumentMetadata(GLM.PredictFromArcGISTable, u'predictedValuesField',
    typeMetadata=ArcGISFieldTypeMetadata(canBeNone=True),
    description=_(
u"""Field to receive the predicted values produced by the model.
This parameter is optional. It allows you to record the predicted
values so you can perform follow-up analysis."""),
    arcGISDisplayName=_(u'Field to receive the predicted values of the response variable'),
    arcGISParameterDependencies=[u'inputTable'])
# Binary-classification cutoff for table predictions. When omitted for a
# binomial model, the tool picks the cutoff that maximizes the Youden index.
AddArgumentMetadata(GLM.PredictFromArcGISTable, u'cutoff',
    typeMetadata=FloatTypeMetadata(minValue=0., maxValue=1., canBeNone=True),
    description=_(
u"""Cutoff to use when classifying the continuous probability output
by a binomial model into a binary result (0 or 1). Probabilities
greater than or equal to the cutoff are classified as 1; probabilities
less than the cutoff are classified as 0.

This parameter should not be used for models that are not intended to
perform binary classification.

For binomial models, if a value is not provided, the tool will
automatically select the value that maximizes the value of the Youden
index (see Perkins and Schisterman, 2006), thereby attempting to
minimize the misclassification rate of the model. This approach may
not be optimal for your application; we encourage you to review the
extensive discussion of cutoffs in the scientific literature and select
a value deliberately.

References

Perkins NJ, Schisterman EF (2006) The Inconsistency of "Optimal"
Cutpoints Obtained using Two Criteria based on the Receiver Operating
Characteristic Curve. American Journal of Epidemiology 163:
670-675."""),
    arcGISDisplayName=_(u'Binary classification cutoff'))

# SQL where clause; the description is reused from the select cursor's
# metadata so the two stay in sync.
AddArgumentMetadata(GLM.PredictFromArcGISTable, u'where',
    typeMetadata=SQLWhereClauseTypeMetadata(canBeNone=True),
    description=ArcGIS91SelectCursor.__init__.__doc__.Obj.GetArgumentByName(u'where').Description,
    arcGISParameterDependencies=[u'inputTable'],
    arcGISDisplayName=_(u'Where clause'),
    arcGISCategory=_(u'Prediction options'))
# Controls extrapolation: by default the tool refuses to predict outside the
# range of predictor values used to fit the model.
AddArgumentMetadata(GLM.PredictFromArcGISTable, u'ignoreOutOfRangeValues',
    typeMetadata=BooleanTypeMetadata(),
    description=_(
u"""If True, predictions will not be made where predictor values fall
outside of the range of values used to fit the model.

If False, predictions will be attempted regardless of what the
predictor values are.

This parameter is set to True by default because many believe that it
is a bad practice to extrapolate a model beyond the range of values
used to fit it. But if your model provides a very strong fit, or you
have some other reason to believe it is very robust, you can set this
parameter to False to perform the extrapolation."""),
    arcGISDisplayName=_(u'Ignore values outside the modeled range'),
    arcGISCategory=_(u'Prediction options'))

# Substitute value stored when no prediction can be made; NULL (or -9999 for
# formats such as shapefiles that cannot store NULL) is used when omitted.
AddArgumentMetadata(GLM.PredictFromArcGISTable, u'noDataValue',
    typeMetadata=FloatTypeMetadata(canBeNone=True),
    description=_(
u"""Value to use when a prediction cannot be made (when, for
example, predictor values are out of the modeled range).

If a value is not provided for this parameter, a database NULL value
will be stored in the field when a prediction cannot be made. If the
field cannot store NULL values, as is the case with shapefiles, the
value -9999 will be used."""),
    arcGISDisplayName=_(u'Value to use when a prediction cannot be made'),
    arcGISCategory=_(u'Prediction options'))
# Optional diagnostic plot (ROC curve by default) for binary classification
# models; the file extension (.emf or .png) selects the output format.
AddArgumentMetadata(GLM.PredictFromArcGISTable, u'outputPlotFile',
    typeMetadata=FileTypeMetadata(canBeNone=True,
        mustBeDifferentThanArguments=[u'inputModelFile', u'inputTable'],
        deleteIfParameterIsTrue=u'overwriteExisting',
        createParentDirectories=True),
    description=_(
u"""Diagnostic plot file to create for binary classification models.

By default, the plot will show a receiver operating characteristic
(ROC) curve for the predictions, but you can specify different
performance measures if desired. The ROC curve plots the true positive
rate against the false positive rate.

The file must have one of the following two extensions, which
determines the format that will be used:

* .emf - Windows enhanced metafile (EMF) format. This is a vector
  format that may be printed and resized without any pixelation and is
  therefore suitable for use in printable documents that recognize
  this format (e.g. Microsoft Word or Microsoft Visio).

* .png - Portable network graphics (PNG) format. This is a compressed,
  lossless, highly portable raster format suitable for use in web
  pages or other locations where a raster format is desired. Most
  scientific journals accept PNG; they typically request that files
  have a resolution of at least 1000 DPI.
"""),
    direction=u'Output',
    arcGISDisplayName=_(u'Output diagnostic plot file'),
    arcGISCategory=_(u'Binary classification options'))
# First performance measure for the diagnostic plot. The allowed values and
# their descriptions come from the R ROCR package, which renders the plot.
AddArgumentMetadata(GLM.PredictFromArcGISTable, u'measure1',
    typeMetadata=UnicodeStringTypeMetadata(allowedValues=[u'acc', u'cal',
        u'chisq', u'ecost', u'err', u'f', u'fall', u'fnr', u'fpr', u'lift', u'mat',
        u'mi', u'miss', u'npv', u'odds', u'ppv', u'pcfall', u'pcmiss', u'phi',
        u'prec', u'rch', u'rec', u'rnp', u'rpp', u'sar', u'sens', u'spec', u'tnr',
        u'tpr']),
    description=_(
u"""The first performance measure to plot.

This measure will serve as the Y coordinate for the plot. If a second
measure is specified, it will serve as the X coordinate. Otherwise the
model cutoff will serve as the X coordinate.

When modeling a binary response, an important task is selecting a
cutoff. The cutoff is the value used to determine whether a given
predicted value of the response variable should be classified as
positive or negative. Because the model typically outputs predictions
along a continual range (e.g. between 0.0 and 1.0), you must determine
the portions of the range that represent positive and negative
responses. Response values less than the cutoff are classified as
negative and those greater than or equal to the cutoff are classified
as positive.

Plots that use the cutoff as the X coordinate show you a measure of
the model's performance for the range of cutoff values. You can use
this plot to select the cutoff value that maximizes, minimizes,
or otherwise optimizes a given performance measure. For example, if
you selected "acc" (accuracy) as the performance measure, you could
find the cutoff value that maximized the model's accuracy by finding
the highest point on the plot and then looking up the cutoff value on
the X axis.

Plots that use a second performance measure as the X coordinate allow
you to balance one measure of the model's performance against another.
The plot will be color-coded by cutoff value, allowing you to look up
the cutoff for a given combination of the two performance measures.
Selecting an optimal cutoff will often involve making a tradeoff
between the two measures.

For example, to create a classic receiver operating characteristic
(ROC) curve, select "tpr" (true positive rate) for the first measure
and "fpr" (false positive rate) for the second measure.

For the first performance measure, you can select from the following
list. This list is taken from the documentation of the R ROCR package
that implements the plots. Some of them do not allow selection of a
second performance measure (you must omit it when invoking this tool),
as noted in the measure's description.

In these descriptions, Y and Yhat are random variables representing
the class and the prediction for a randomly drawn sample,
respectively. + and - are the positive and negative class,
respectively. The following abbreviations are used for empirical
quantities: P (# positive samples), N (# negative samples), TP (# true
positives), TN (# true negatives), FP (# false positives), FN (# false
negatives).

* acc - Accuracy. P(Yhat = Y). Estimated as: (TP+TN)/(P+N).

* cal - Calibration error. The calibration error is the absolute
  difference between predicted confidence and actual reliability. This
  error is estimated at all cutoffs by sliding a window of size 100
  across the range of possible cutoffs. E.g., if for several positive
  samples the output of the classifier is around 0.75, you might
  expect from a well-calibrated classifier that the fraction of them
  which is correctly predicted as positive is also around 0.75. In a
  well-calibrated classifier, the probabilistic confidence estimates
  are realistic. Only for use with probabilistic output (i.e. scores
  between 0 and 1; some of the other measures actually support values
  between -1 and 1).

* chisq - Chi square test statistic. Note that R might raise a warning
  if the sample size is too small.

* ecost - Expected cost. For details on cost curves, cf. Drummond &
  Holte 2000, 2004. ecost has an obligatory x axis, the so-called
  'probability-cost function'; thus you may not specify a second
  performance measure.

* err - Error rate. P(Yhat != Y). Estimated as: (FP+FN)/(P+N).

* f - Precision-recall F measure (van Rijsbergen, 1979). Weighted
  harmonic mean of precision (P) and recall (R). F = 1/ (alpha*1/P +
  (1-alpha)*1/R). For this tool, alpha is always 1/2, so the mean is
  balanced.

* fall - Fallout. Same as fpr.

* fnr - False negative rate. P(Yhat = - | Y = +). Estimated as: FN/P.

* fpr - False positive rate. P(Yhat = + | Y = -). Estimated as: FP/N.

* lift - Lift value. P(Yhat = + | Y = +)/P(Yhat = +).

* mat - Matthews correlation coefficient. Same as phi.

* mi - Mutual information. I(Yhat, Y) := H(Y) - H(Y | Yhat), where H
  is the (conditional) entropy. Entropies are estimated naively (no
  bias correction).

* miss - Miss. Same as fnr.

* npv - Negative predictive value. P(Y = - | Yhat = -). Estimated as:
  TN/(TN+FN).

* odds - Odds ratio. (TP*TN)/(FN*FP). Note that odds ratio produces
  Inf or NA values for all cutoffs corresponding to FN=0 or FP=0. This
  can substantially decrease the plotted cutoff region.

* pcfall - Prediction-conditioned fallout. P(Y = - | Yhat = +).
  Estimated as: FP/(TP+FP).

* pcmiss - Prediction-conditioned miss. P(Y = + | Yhat = -). Estimated
  as: FN/(TN+FN).

* phi - Phi correlation coefficient. (TP*TN -
  FP*FN)/(sqrt((TP+FN)*(TN+FP)*(TP+FP)*(TN+FN))). Yields a number
  between -1 and 1, with 1 indicating a perfect prediction, 0 indicating
  a random prediction. Values below 0 indicate a worse than random
  prediction.

* ppv - Positive predictive value. P(Y = + | Yhat = +). Estimated as:
  TP/(TP+FP).

* prec - Precision. Same as ppv.

* rch - ROC convex hull. A ROC (=tpr vs fpr) curve with concavities
  (which represent suboptimal choices of cutoff) removed (Fawcett
  2001). Since the result is already a parametric performance curve,
  it cannot be used in combination with other measures (thus you may
  not specify a second performance measure).

* rec - Recall. Same as tpr.

* rnp - Rate of negative predictions. P(Yhat = -). Estimated as:
  (TN+FN)/(TP+FP+TN+FN).

* rpp - Rate of positive predictions. P(Yhat = +). Estimated as:
  (TP+FP)/(TP+FP+TN+FN).

* sar - Score combining performance measures of different
  characteristics, in the attempt of creating a more "robust" measure
  (cf. Caruana R., ROCAI2004): SAR = 1/3 * (Accuracy + Area under the
  ROC curve + Root mean-squared error).

* sens - Sensitivity. Same as tpr.

* spec - Specificity. Same as tnr.

* tnr - True negative rate. P(Yhat = - | Y = -). Estimated as: TN/N.

* tpr - True positive rate. P(Yhat = + | Y = +). Estimated as: TP/P.
"""),
    arcGISDisplayName=_(u'First performance measure to plot'),
    arcGISCategory=_(u'Binary classification options'))
# Second performance measure; replaces the cutoff on the X axis when given.
# Note that 'ecost' and 'rch' are deliberately absent: they cannot be
# combined with another measure (see the measure1 documentation).
AddArgumentMetadata(GLM.PredictFromArcGISTable, u'measure2',
    typeMetadata=UnicodeStringTypeMetadata(canBeNone=True,
        allowedValues=[u'acc', u'cal', u'chisq', u'err', u'f', u'fall', u'fnr',
            u'fpr', u'lift', u'mat', u'mi', u'miss', u'npv', u'odds', u'ppv', u'pcfall',
            u'pcmiss', u'phi', u'prec', u'rec', u'rnp', u'rpp', u'sar', u'sens', u'spec',
            u'tnr', u'tpr']),
    description=_(
u"""The second performance measure to plot.

If specified, this performance measure will be used as the X
coordinate of the plot instead of the model cutoff value. Please see
the documentation for the First Performance Measure for more
information."""),
    arcGISDisplayName=_(u'Second performance measure to plot'),
    arcGISCategory=_(u'Binary classification options'))

AddArgumentMetadata(GLM.PredictFromArcGISTable, u'colorize',
    typeMetadata=BooleanTypeMetadata(),
    description=_(
u"""If True (the default) and a second performance measure is selected
for plotting, the plot will be colorized by the cutoff value.
Otherwise, the plot will be black."""),
    arcGISDisplayName=_(u'Colorize plot by cutoff value'),
    arcGISCategory=_(u'Binary classification options'))

# Optional text file capturing the same summary statistics the tool logs.
AddArgumentMetadata(GLM.PredictFromArcGISTable, u'outputSummaryFile',
    typeMetadata=FileTypeMetadata(canBeNone=True,
        mustBeDifferentThanArguments=[u'inputModelFile', u'inputTable',
            u'outputPlotFile'],
        deleteIfParameterIsTrue=u'overwriteExisting',
        createParentDirectories=True),
    description=_(
u"""Text file to receive the model summary statistics that this tool
reports as log messages."""),
    direction=u'Output',
    arcGISDisplayName=_(u'Output summary statistics file'),
    arcGISCategory=_(u'Additional output options'))

# Plot geometry and overwrite behavior are shared with GLM.FitToArcGISTable.
CopyArgumentMetadata(GLM.FitToArcGISTable, u'res', GLM.PredictFromArcGISTable, u'res')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'width', GLM.PredictFromArcGISTable, u'width')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'height', GLM.PredictFromArcGISTable, u'height')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'pointSize', GLM.PredictFromArcGISTable, u'pointSize')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'bg', GLM.PredictFromArcGISTable, u'bg')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'overwriteExisting', GLM.PredictFromArcGISTable, u'overwriteExisting')

AddResultMetadata(GLM.PredictFromArcGISTable, u'updatedTable',
    typeMetadata=ArcGISTableViewTypeMetadata(),
    description=_(u'Updated table.'),
    arcGISDisplayName=_(u'Updated table'),
    arcGISParameterDependencies=[u'inputTable'])

AddResultMetadata(GLM.PredictFromArcGISTable, u'outputCutoff',
    typeMetadata=FloatTypeMetadata(canBeNone=True),
    description=_(
u"""Output cutoff value, either the value provided as input or, if one
was not provided, the value selected automatically."""),
    arcGISDisplayName=_(u'Output cutoff'))
# Public method: GLM.PredictFromArcGISRasters

AddMethodMetadata(GLM.PredictFromArcGISRasters,
    shortDescription=_(u'Using a fitted generalized linear model (GLM), this tool creates a raster representing the response variable predicted from rasters representing the predictor variables.'),
    isExposedToPythonCallers=True,
    isExposedByCOM=True,
    isExposedAsArcGISTool=True,
    arcGISDisplayName=_(u'Predict GLM From Rasters'),
    arcGISToolCategory=_(u'Statistics\\Model Data\\Generalized Linear Models'),
    # rgdal is needed to read/write rasters from R during the prediction.
    dependencies=[ArcGISDependency(9, 2), RDependency(2, 6, 0), RPackageDependency(u'rgdal')])

CopyArgumentMetadata(GLM.FitToArcGISTable, u'cls', GLM.PredictFromArcGISRasters, u'cls')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'inputModelFile', GLM.PredictFromArcGISRasters, u'inputModelFile')
# Output raster of predicted responses; geometry is taken from the template
# raster (or the first predictor raster when no template is given).
AddArgumentMetadata(GLM.PredictFromArcGISRasters, u'outputResponseRaster',
    typeMetadata=ArcGISRasterTypeMetadata(mustBeDifferentThanArguments=[u'inputModelFile'],
        deleteIfParameterIsTrue=u'overwriteExisting',
        createParentDirectories=True),
    description=_(
u"""Output raster representing the predicted response.

The output raster will have the coordinate system, extent, and cell
size of the Template Raster. If a Template Raster is not specified,
the first Predictor Raster will be used as the template instead. The
prediction is performed for each cell of the template by extracting
the predictor values for that cell and processing them through the
fitted model.

Each predictor can either be obtained from a raster that gives the
values of it or assigned a constant value that is the same for all
cells. Accordingly, all predictors in the model must be listed under
the Raster Predictor Variables parameter of this tool or the Constant
Predictor Variables parameter (but not both).

For example, if your model used the formula::

    Response ~ SST + Depth + DayOfYear

In this model, SST and Depth both vary spatially. These should be
listed under Raster Predictor Variables, and corresponding temperature
and depth rasters should be listed under Predictor Rasters.

On the other hand, DayOfYear does not vary spatially. Therefore it
should be listed under Constant Predictor Variables, and a value
should be given under Constant Values."""),
    direction=u'Output',
    arcGISDisplayName=_(u'Output response raster'))

AddArgumentMetadata(GLM.PredictFromArcGISRasters, u'templateRaster',
    typeMetadata=ArcGISRasterLayerTypeMetadata(mustExist=True,
        mustBeDifferentThanArguments=[u'inputModelFile', u'outputResponseRaster'],
        canBeNone=True),
    description=_(
u"""Template raster that defines the coordinate system, extent, and
cell size of the output raster produced by this tool.

If you do not specify a template raster, the first predictor raster
will be used instead."""),
    arcGISDisplayName=_(u'Template raster'))
# Names of predictors supplied as rasters; each must be paired with a raster
# in predictorRasters (enforced by mustBeSameLengthAsArgument below).
AddArgumentMetadata(GLM.PredictFromArcGISRasters, u'rasterPredictorNames',
    typeMetadata=ListTypeMetadata(UnicodeStringTypeMetadata(), canBeNone=True),
    description=_(
u"""Predictor variables that should be obtained from rasters. For each
variable listed here, a raster must also be listed under the Predictor
Rasters parameter.

These names are case sensitive, and must appear exactly as they were
written in the formula when the model was fitted.

Please see the documentation for the Output Response Raster for more
discussion of this parameter."""),
    arcGISDisplayName=_(u'Raster predictor variables'))

AddArgumentMetadata(GLM.PredictFromArcGISRasters, u'predictorRasters',
    typeMetadata=ListTypeMetadata(ArcGISRasterLayerTypeMetadata(mustExist=True),
        canBeNone=True, mustBeSameLengthAsArgument=u'rasterPredictorNames'),
    description=_(
u"""Rasters for the raster predictor variables.

If you do not specify a Template Raster, the first predictor raster
will also be used as the template that defines the coordinate system,
extent, and cell size of the output raster produced by this tool.

All of the predictor rasters must have a coordinate system defined.
They must all have the same datum as the template, or a datum that
ArcGIS can automatically project to the template's datum without
requiring a Geographic Transformation to be specified.

The predictor rasters need not have the same coordinate system,
extent, or cell size as the template. If these characteristics differ
from the template raster, this tool will automatically project and
clip the predictor rasters to conform to the template raster. The
predictor rasters must be able to be projected to the template
raster's coordinate system without requiring the specification of a
geographic transformation. An error will be reported if a geographic
transformation must be specified. In this case, you must project the
predictor rasters manually before providing them to this tool.

By default, floating point predictor rasters will be projected using
bilinear interpolation and integer predictor rasters will be projected
using nearest neighbor assignment. Bilinear interpolation is
appropriate for continuous variables such as sea surface temperature,
while nearest neighbor is appropriate for categorical variables such
as bottom substrate type. If these defaults are not appropriate for
your predictors, specify different algorithms using the Resampling
Techniques for Predictor Rasters parameter."""),
    arcGISDisplayName=_(u'Predictor rasters'))

# Names of predictors given as spatially-constant values, paired with
# constantPredictorValues.
AddArgumentMetadata(GLM.PredictFromArcGISRasters, u'constantPredictorNames',
    typeMetadata=ListTypeMetadata(UnicodeStringTypeMetadata(), canBeNone=True),
    description=_(
u"""Predictor variables that should be given constant values for the
prediction. For each variable listed here, a value must also be listed
under the Constant Values parameter.

These names are case sensitive, and must appear exactly as they were
written in the formula when the model was fitted.

Please see the documentation for the Output Response Raster for more
discussion of this parameter."""),
    arcGISDisplayName=_(u'Constant predictor variables'))

AddArgumentMetadata(GLM.PredictFromArcGISRasters, u'constantPredictorValues',
    typeMetadata=ListTypeMetadata(UnicodeStringTypeMetadata(),
        canBeNone=True, mustBeSameLengthAsArgument=u'constantPredictorNames'),
    description=_(
u"""Values for the constant predictor variables.

The values must be strings but the tool will internally coerce them to
integers or floating point numbers as needed. For values that should
remain strings, do not specify quotation marks or other delimiters.

Please see the documentation for the Output Response Raster for more
discussion of this parameter."""),
    arcGISDisplayName=_(u'Constant values'))
# Binary classification cutoff for raster predictions; providing one also
# switches the output raster to an integer data type.
AddArgumentMetadata(GLM.PredictFromArcGISRasters, u'cutoff',
    typeMetadata=FloatTypeMetadata(minValue=0., maxValue=1., canBeNone=True),
    description=_(
u"""Cutoff to use when classifying the continuous probability output
by a binomial model into a binary result (0 or 1). This parameter
should not be specified for models that are not intended to perform
binary classification.

If a cutoff is provided, the output raster will have an integer data
type instead of a floating-point data type. Probabilities greater than
or equal to the cutoff will be classified as 1; probabilities less
than the cutoff will be classified as 0."""),
    arcGISDisplayName=_(u'Binary classification cutoff'))

AddArgumentMetadata(GLM.PredictFromArcGISRasters, u'resamplingTechniques',
    typeMetadata=ListTypeMetadata(elementType=UnicodeStringTypeMetadata(allowedValues=[u'NEAREST',
        u'BILINEAR', u'CUBIC', u'MAJORITY'], makeLowercase=True), canBeNone=True),
    description=_(
u"""Resampling technique to be used for each predictor raster if that
raster needs to be automatically projected, as described in the
documentation for the Predictor Rasters parameter.

The available resampling techniques are:

* NEAREST - Nearest neighbor assignment
* BILINEAR - Bilinear interpolation
* CUBIC - Cubic convolution
* MAJORITY - Majority resampling (requires ArcGIS 9.3 or later)

If you do not specify a resampling technique for a given input
predictor raster, BILINEAR will be used.

The ArcGIS documentation provides the following information about the
resampling techniques:

* The NEAREST option, which performs a nearest neighbor assignment, is
  the fastest of the interpolation methods. It is primarily used for
  categorical data, such as a land use classification, because it will
  not change the cell values. Do not use NEAREST for continuous data,
  such as elevation surfaces.

* The MAJORITY option is also intended for categorical data, but not
  for continuous data. This option requires ArcGIS 9.3 or later.

* The BILINEAR option, bilinear interpolation, determines the new
  value of a cell based on a weighted distance average of surrounding
  cells. Do not use BILINEAR with categorical data, such as land use
  classifications.

* The CUBIC option, cubic convolution, determines the new cell
  value by fitting a smooth curve through the surrounding points.
  These are most appropriate for continuous data and may cause some
  smoothing; also, cubic convolution may result in the output raster
  containing values outside the range of the input raster. Do not use
  CUBIC with categorical data.
"""),
    arcGISDisplayName=_(u'Resampling techniques'),
    arcGISCategory=_(u'Prediction options'))

AddArgumentMetadata(GLM.PredictFromArcGISRasters, u'ignoreOutOfRangeValues',
    typeMetadata=BooleanTypeMetadata(),
    description=_(
u"""If True, predictions will not be made where predictor values fall
outside of the range of values used to fit the model. These cells will
appear as NoData in the output raster.

If False, predictions will be attempted regardless of what the
predictor values are.

This parameter is set to True by default because many believe that it
is a bad practice to extrapolate a model beyond the range of values
used to fit it. But if your model provides a very strong fit, or you
have some other reason to believe it is very robust, you can set this
parameter to False to perform the extrapolation."""),
    arcGISDisplayName=_(u'Ignore values outside the modeled range'),
    arcGISCategory=_(u'Prediction options'))

# NOTE(review): the body of this description discusses NoData propagation
# rather than standard errors specifically — looks copied from the response
# raster text; confirm against the original source before relying on it.
AddArgumentMetadata(GLM.PredictFromArcGISRasters, u'outputErrorRaster',
    typeMetadata=ArcGISRasterTypeMetadata(canBeNone=True,
        mustBeDifferentThanArguments=[u'inputModelFile', u'outputResponseRaster'],
        deleteIfParameterIsTrue=u'overwriteExisting',
        createParentDirectories=True),
    description=_(
u"""Output raster representing the standard errors of the predicted
response.

If any pixel is NoData in any predictor raster (after it has been
projected and clipped as needed), it will be NoData in the output
raster as well. This is because the response variable cannot be
predicted by the model if any of the predictor variables are
missing."""),
    direction=u'Output',
    arcGISDisplayName=_(u'Output standard error raster'),
    arcGISCategory=_(u'Additional output options'))

AddArgumentMetadata(GLM.PredictFromArcGISRasters, u'buildPyramids',
    typeMetadata=BooleanTypeMetadata(),
    description=_(
u"""If True, pyramids will be built for the output raster, which will
improve its display speed in the ArcGIS user interface."""),
    arcGISDisplayName=_(u'Build pyramids'),
    arcGISCategory=_(u'Additional output options'))

# Initialized from the geoprocessor's OverwriteOutput setting so the tool
# honors the user's ArcGIS environment.
AddArgumentMetadata(GLM.PredictFromArcGISRasters, u'overwriteExisting',
    typeMetadata=BooleanTypeMetadata(),
    description=_(
u"""If True, the output raster will be overwritten, if it exists. If
False, a ValueError will be raised if the output raster exists."""),
    initializeToArcGISGeoprocessorVariable=u'OverwriteOutput')
###############################################################################
# Metadata: GAM class
###############################################################################

AddClassMetadata(GAM,
    shortDescription=_(u'Provides methods for modeling and prediction using Generalized Additive Models (GAMs).'),
    isExposedAsCOMServer=True,
    comIID=u'{515D570A-CF61-463A-8250-C65B67BEE0C7}',
    comCLSID=u'{D875D98A-6E4D-486B-B681-225FF40A0D7E}')

# Public method: GAM.FitToArcGISTable

AddMethodMetadata(GAM.FitToArcGISTable,
    shortDescription=_(u'Fits a generalized additive model (GAM) to data in a table.'),
    isExposedToPythonCallers=True,
    isExposedByCOM=True,
    isExposedAsArcGISTool=True,
    arcGISDisplayName=_(u'Fit GAM'),
    arcGISToolCategory=_(u'Statistics\\Model Data\\Generalized Additive Models'),
    dependencies=[ArcGISDependency(9, 1), RDependency(2, 6, 0)])

AddArgumentMetadata(GAM.FitToArcGISTable, u'cls',
    typeMetadata=ClassOrClassInstanceTypeMetadata(cls=GAM),
    description=_(u'%s class or an instance of it.') % GAM.__name__)

# The input table and output model file behave exactly as for the GLM tool.
CopyArgumentMetadata(GLM.FitToArcGISTable, u'inputTable', GAM.FitToArcGISTable, u'inputTable')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'outputModelFile', GAM.FitToArcGISTable, u'outputModelFile')
# The GAM formula reuses the GLM formula's type, description, and display
# name, appending GAM-specific notes about smoothing functions.
AddArgumentMetadata(GAM.FitToArcGISTable, u'formula',
    typeMetadata=GLM.FitToArcGISTable.__doc__.Obj.GetArgumentByName(u'formula').Type,
    description=GLM.FitToArcGISTable.__doc__.Obj.GetArgumentByName(u'formula').Description + _(
u"""
A principal advantage of GAMs over GLMs is the ability to fit a model
using smoothing functions. The most common smoothing function is
called s::

    Presence ~ SST + s(ChlDensity) + Depth

The available smoothing functions and their meanings and parameters
depend on which R package is used to fit the GAM:

* The R gam package provides two smoothing functions: s for spline
  smooths and lo for loess smooths. For more information, please see
  the documentation for these functions in the
  `gam package documentation
  <http://cran.r-project.org/web/packages/gam/gam.pdf>`_.

* The R mgcv package also provides two smoothing functions. The s
  function fits a thin plate regression spline. The te function fits a
  tensor product smooth using a cubic regression spline. Both
  functions allow you to specify different spline types, such as cubic
  regression splines, cubic regression splines with shrinkage, cyclic
  cubic regression splines, and thin plate regression splines with
  shrinkage. For more information, please see the documentation for
  these functions in the
  `mgcv package documentation
  <http://cran.r-project.org/web/packages/mgcv/mgcv.pdf>`_."""),
    arcGISDisplayName=GLM.FitToArcGISTable.__doc__.Obj.GetArgumentByName(u'formula').ArcGISDisplayName)

AddArgumentMetadata(GAM.FitToArcGISTable, u'family',
    typeMetadata=UnicodeStringTypeMetadata(allowedValues=[u'binomial',
        u'gaussian', u'Gamma', u'inverse.gaussian', u'poisson', u'negbin', u'quasi',
        u'quasibinomial', u'quasipoisson']),
    description=_(
u"""Name of the family for the model. The family, together with the
link function and variance parameters, describes the error
distribution that will be used by the model. The family name is case
sensitive.

The negbin family is only available when the mgcv R package is used to
fit the model. When negbin is used, the theta parameter must also be
specified (under Model Options)."""),
    arcGISDisplayName=_(u'Family'))

AddArgumentMetadata(GAM.FitToArcGISTable, u'rPackage',
    typeMetadata=UnicodeStringTypeMetadata(allowedValues=[u'gam', u'mgcv']),
    description=_(
u"""R package to use when fitting the GAM.

* mgcv - the GAM package written by Simon Wood, an expert in
  statistical modeling with smooth functions. This package allows
  terms to be smoothed with a variety of spline types. Stepwise model
  selection is not used with these models because terms can be fitted
  with "shrinkage" splines, allowing them to drop out of the model
  during the fitting process. Because this package is under active
  development and maintenance and provides more features than the gam
  package (below), it is the default.

* gam - the GAM package written by Trevor Hastie, one of the inventors
  of GAMs. This package allows terms to be smoothed with splines or
  loess functions and models to be optimized using stepwise model
  selection. One disadvantage is that the Predict GAM From ArcGIS
  Rasters tool cannot produce standard error rasters for models fitted
  with this package.

For more information on the two packages, please see the
`gam package documentation
<http://cran.r-project.org/web/packages/gam/gam.pdf>`_
and the
`mgcv package documentation
<http://cran.r-project.org/web/packages/mgcv/mgcv.pdf>`_."""),
    arcGISDisplayName=_(u'R package to use'))

CopyArgumentMetadata(GLM.FitToArcGISTable, u'where', GAM.FitToArcGISTable, u'where')

AddArgumentMetadata(GAM.FitToArcGISTable, u'link',
    typeMetadata=UnicodeStringTypeMetadata(allowedValues=[u'1/mu^2',
        u'cauchit', u'cloglog', u'identity', u'inverse', u'log', u'logit', u'probit',
        u'sqrt'], canBeNone=True),
    description=_(
u"""Name of the link function for the model. The allowed link
functions depend on the model family:

* binomial family - cauchit, cloglog, log, logit, and probit
* gaussian family - identity, inverse, and log
* Gamma family - identity, inverse, and log
* inverse.gaussian family - 1/mu^2, inverse, identity, and log
* negbin family - identity, log, and sqrt
* poisson family - identity, log, and sqrt
* quasi family - 1/mu^2, cloglog, identity, inverse, log, logit, probit, and sqrt
* quasibinomial family - cauchit, cloglog, log, logit, and probit
* quasipoisson family - identity, log, and sqrt

If a link function is not specified, the one that is used by default
also depends on the model family:

* binomial family - logit
* gaussian family - identity
* Gamma family - inverse
* inverse.gaussian family - 1/mu^2
* negbin family - log
* poisson family - log
* quasi family - identity
* quasibinomial family - logit
* quasipoisson family - log
"""),
    arcGISDisplayName=_(u'Link function'),
    arcGISCategory=_(u'Model options'))
CopyArgumentMetadata(GLM.FitToArcGISTable, u'variance', GAM.FitToArcGISTable, u'variance')

# Theta for the negbin family; passed through to mgcv, which accepts a fixed
# value, a search range c(A,B), or a candidate list c(A,B,...).
AddArgumentMetadata(GAM.FitToArcGISTable, u'theta',
    typeMetadata=UnicodeStringTypeMetadata(canBeNone=True),
    description=_(
u"""Theta parameter for the negbin model family.

Theta specifies the amount of dispersion in the data and is required
when fitting a negbin model (it is ignored for all other model
families). It may be specified in several different ways:

* A single number - this number defines the known fixed value of
  theta. Use this alternative when you have determined theta yourself.

* Two numbers - these numbers define the range over which mgcv
  will search for the optimal theta. Use this alternative when you
  have not determined theta yourself. Use the following syntax,
  replacing A and B with the two numbers::

      c(A,B)

* A list of three or more numbers - these numbers define the possible
  values of theta; mgcv will select the optimal value from the list.
  Use this alternative when you only want mgcv to consider specific
  values. Use the following syntax, replacing, A, B, and so on with
  your numbers::

      c(A,B,...)

Please see the article titled "GAM negative binomial family" in the
`mgcv package documentation
<http://cran.r-project.org/web/packages/mgcv/mgcv.pdf>`_.
for more information about how to use this parameter."""),
    arcGISDisplayName=_(u'Theta parameter for negbin family'),
    arcGISCategory=_(u'Model options'))

AddArgumentMetadata(GAM.FitToArcGISTable, u'method',
    typeMetadata=UnicodeStringTypeMetadata(canBeNone=True),
    description=_(
u"""Smoothing parameter estimation method to use, when fitting the GAM
with the R mgcv package (this parameter is ignored when using the R
gam package).

The methods available depend on the version of mgcv you have
installed. In mgcv version 1.7-5, the following methods were
available:

* GCV.Cp - GCV for Poisson and binomial models and Mallows'
  Cp/UBRE/AIC for others. (This is based on the fact that this tool
  does not provide a scale parameter when it calls mgcv. The default
  value of scale is 0, indicating that the scale should be 1 for
  Poisson and binomial models and unknown otherwise.)

* GACV.Cp - equivalent to GCV.Cp, but uses GACV in place of GCV.

* REML - REML estimation.

* P-REML - REML estimation, but using a Pearson estimate of the scale.

* ML - maximum likelihood estimation.

* P-ML - maximum likelihood estimation, but using a Pearson estimate
  of the scale.

The default is GCV.Cp."""),
    arcGISDisplayName=_(u'Smoothing parameter estimation method'),
    arcGISCategory=_(u'Model options'))

AddArgumentMetadata(GAM.FitToArcGISTable, u'optimizer',
    typeMetadata=UnicodeStringTypeMetadata(canBeNone=True),
    description=_(
u"""Numerical optimization method to use to optimize the smoothing
parameter estimation criterion, when fitting the GAM with the R mgcv
package (this parameter is ignored when using the R gam package).

The optimizers available depend on the version of mgcv you have
installed. In mgcv version 1.7-5, the following optimizers were
available:

* perf - performance iteration.

* outer - a more stable direct approach. This is the default. If this
  method is selected, you may also specify an alternative optimizer.

Please consult the mgcv documentation and authors for more information
about these optimizers."""),
    arcGISDisplayName=_(u'Optimizer'),
    arcGISCategory=_(u'Model options'))

AddArgumentMetadata(GAM.FitToArcGISTable, u'alternativeOptimizer',
    typeMetadata=UnicodeStringTypeMetadata(canBeNone=True),
    description=_(
u"""Alternative mgcv optimizer to use when the "outer" optimizer is
selected for the previous parameter.

The optimizers available depend on the version of mgcv you have
installed. In mgcv version 1.7-5, the following optimizers were
available:

* newton - the default.
* bfgs
* optim
* nlm
* nlm.fd - the mgcv documentation says this method "is based entirely
  on finite differenced derivatives and is very slow".

Please consult the mgcv documentation and authors for more information
about these optimizers."""),
    arcGISDisplayName=_(u'Alternative optimizer'),
    arcGISCategory=_(u'Model options'))

# Coordinate field names behave exactly as for the GLM fitting tool.
CopyArgumentMetadata(GLM.FitToArcGISTable, u'xColumnName', GAM.FitToArcGISTable, u'xColumnName')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'yColumnName', GAM.FitToArcGISTable, u'yColumnName')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'zColumnName', GAM.FitToArcGISTable, u'zColumnName')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'mColumnName', GAM.FitToArcGISTable, u'mColumnName')
# Automated model selection argument for GAM.FitToArcGISTable. Only one
# method is currently offered ("Stepwise backward", available only when the
# R gam package fitted the model); makeLowercase=True lets callers pass any
# capitalization. Fixes a typo in the description ("candidate model" ->
# "candidate models").
AddArgumentMetadata(GAM.FitToArcGISTable, u'selectionMethod',
    typeMetadata=UnicodeStringTypeMetadata(canBeNone=True, allowedValues=[u'Stepwise backward'], makeLowercase=True),
    description=_(
u"""Automated model selection method to execute after fitting the
model using the original formula you specified. If automated model
selection is performed, the output model file will contain the final
model that was selected, and the output summary and plots will be for
this model.

The selection methods currently available are:

* Stepwise backward - backward stepwise model selection by AIC. This
  method is only available if the gam was fitted using the R gam
  package. The basic idea of this method is to keep dropping model
  terms from the original formula so long as a better model fit can be
  achieved. Model selection occurs in a loop. First, the Akaike
  Information Criterion (AIC) is computed for the current model. Then,
  N candidate models are generated, where N equals the number of terms
  in the current model. Each candidate model drops a different single
  term from the model. The AIC is computed for each candidate model,
  and the candidate model that provides the greatest reduction in AIC
  becomes the new current model. The loop proceeds until the AIC can
  no longer be reduced by dropping terms.

Additional methods may be implemented in a future release of this
tool.

Automated model selection will increase the run time and possibly the
memory utilization of this tool. The amount of increase depends on the
complexity of your model and the amount of data in your table.

For an alternative to automated model selection, you can fit the GAM
using the R mgcv package and use splines with "shrinkage" by setting
the bs parameter of the s function to "ts" for a thin plate regression
spline with shrinkage or "cs" for a cubic regression spline with
shrinkage. For example::

    Presence ~ s(SST,bs="ts") + s(Chl,bs="ts") + s(Depth,bs="ts")

After the model is fitted, if the model summary shows that a term has
an effective degrees of freedom (edf) close to zero, the term has been
"zeroed out" and the plot of the term will show a horizontal line at
the x axis. This indicates that the term is a poor predictor of the
response variable and should be removed from the model.

Model selection can be a difficult task. You should never blindly rely
on an automated selection method that you do not fully understand. For
the best results, always consult a statistician."""),
    arcGISDisplayName=_(u'Automated model selection method'),
    arcGISCategory=_(u'Automated model selection options'))
# Share the selection-logging and summary-file arguments already defined on
# GLM.FitToArcGISTable.
CopyArgumentMetadata(GLM.FitToArcGISTable, u'logSelectionDetails',
GAM.FitToArcGISTable, u'logSelectionDetails')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'writeSummaryFile',
GAM.FitToArcGISTable, u'writeSummaryFile')
# GAM-specific: diagnostic plots are only produced by mgcv (via gam.check),
# per the description below.
AddArgumentMetadata(GAM.FitToArcGISTable, u'writeDiagnosticPlots',
typeMetadata=BooleanTypeMetadata(),
description=_(
u"""If True and the R mgcv package was used to fit the GAM, this tool
will write four diagnostic plots for the fitted model. If False or the
R gam package was used to fit the GAM, no plots will be written.
The four plots are written to a single PNG file named X_diag.png,
where X is the name of the output model file, minus any extension. The
plots include a normal Q-Q plot, a plot of residuals vs. linear
predictors, a histogram of residuals, and a plot of the response vs.
the fitted values. For more information about these plots, please see
the documentation for the gam.check function in the
`mgcv package documentation
<http://cran.r-project.org/web/packages/mgcv/mgcv.pdf>`_."""),
arcGISDisplayName=_(u'Write diagnostic plots'),
arcGISCategory=_(u'Additional output options'))
CopyArgumentMetadata(GLM.FitToArcGISTable, u'writeTermPlots',
GAM.FitToArcGISTable, u'writeTermPlots')
AddArgumentMetadata(GAM.FitToArcGISTable, u'residuals',
typeMetadata=BooleanTypeMetadata(),
description=_(
u"""If True, residuals will be plotted in model term plots of
continuous terms (i.e. terms that are not factors).
Regardless of what R package was used to fit the GAM, they are
calculated using the method employed by the mgcv package. That is,
they are "the working residuals from the IRLS iteration weighted by
the IRLS weights. Partial residuals for a smooth term [i.e. a term
fitted using the s, lo, or te function] are the residuals that would
be obtained by dropping the term concerned from the model, while
leaving all other estimates fixed (i.e. the estimates for the term
plus the residuals)." (The same method is used for linear terms, as
well.)"""),
arcGISDisplayName=_(u'Plot residuals'),
arcGISCategory=_(u'Additional output options'))
# Remaining plot-formatting arguments are identical to the GLM tool's.
CopyArgumentMetadata(GLM.FitToArcGISTable, u'xAxis', GAM.FitToArcGISTable,
u'xAxis')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'commonScale',
GAM.FitToArcGISTable, u'commonScale')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'plotFileFormat',
GAM.FitToArcGISTable, u'plotFileFormat')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'res', GAM.FitToArcGISTable,
u'res')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'width', GAM.FitToArcGISTable,
u'width')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'height', GAM.FitToArcGISTable,
u'height')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'pointSize',
GAM.FitToArcGISTable, u'pointSize')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'bg', GAM.FitToArcGISTable, u'bg')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'overwriteExisting',
GAM.FitToArcGISTable, u'overwriteExisting')
# Public method: GAM.PredictFromArcGISTable
AddMethodMetadata(GAM.PredictFromArcGISTable,
shortDescription=_(u'Given a fitted generalized additive model (GAM),
this tool predicts the response variable for each row of a table.'),
longDescription=_(
u"""If a table is not provided, the prediction will be done on the
training data used to fit the model.
On completion, the tool outputs statistics that summarize how well the
model's predictions match the observed values of the response
variable, unless a table is provided that does not contain the
observed values of the response variable."""),
isExposedToPythonCallers=True,
isExposedByCOM=True,
isExposedAsArcGISTool=True,
arcGISDisplayName=_(u'Predict GAM From Table'),
arcGISToolCategory=_(u'Statistics\\Model Data\\Generalized Additive
Models'),
dependencies=[ArcGISDependency(9, 2), RDependency(2, 6, 0),
RPackageDependency(u'ROCR'), RPackageDependency(u'e1071'),
RPackageDependency(u'caret')]) # e1071 is required by caret
CopyArgumentMetadata(GAM.FitToArcGISTable, u'cls',
GAM.PredictFromArcGISTable, u'cls')
AddArgumentMetadata(GAM.PredictFromArcGISTable, u'inputModelFile',
typeMetadata=FileTypeMetadata(mustExist=True),
description=_(
u"""File that contains the fitted model, generated by the Fit GAM
tool."""),
arcGISDisplayName=_(u'Input model file'))
# All remaining arguments and results mirror those of
# GLM.PredictFromArcGISTable so the two prediction tools present identical
# interfaces.
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'inputTable',
GAM.PredictFromArcGISTable, u'inputTable')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'predictedValuesField',
GAM.PredictFromArcGISTable, u'predictedValuesField')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'cutoff',
GAM.PredictFromArcGISTable, u'cutoff')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'where',
GAM.PredictFromArcGISTable, u'where')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'ignoreOutOfRangeValues',
GAM.PredictFromArcGISTable, u'ignoreOutOfRangeValues')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'noDataValue',
GAM.PredictFromArcGISTable, u'noDataValue')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'outputPlotFile',
GAM.PredictFromArcGISTable, u'outputPlotFile')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'measure1',
GAM.PredictFromArcGISTable, u'measure1')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'measure2',
GAM.PredictFromArcGISTable, u'measure2')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'colorize',
GAM.PredictFromArcGISTable, u'colorize')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'outputSummaryFile',
GAM.PredictFromArcGISTable, u'outputSummaryFile')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'res',
GAM.PredictFromArcGISTable, u'res')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'width',
GAM.PredictFromArcGISTable, u'width')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'height',
GAM.PredictFromArcGISTable, u'height')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'pointSize',
GAM.PredictFromArcGISTable, u'pointSize')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'bg',
GAM.PredictFromArcGISTable, u'bg')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'overwriteExisting',
GAM.PredictFromArcGISTable, u'overwriteExisting')
CopyResultMetadata(GLM.PredictFromArcGISTable, u'updatedTable',
GAM.PredictFromArcGISTable, u'updatedTable')
CopyResultMetadata(GLM.PredictFromArcGISTable, u'outputCutoff',
GAM.PredictFromArcGISTable, u'outputCutoff')
# Public method: GAM.PredictFromArcGISRasters
AddMethodMetadata(GAM.PredictFromArcGISRasters,
shortDescription=_(u'Using a fitted generalized additive model (GAM),
this tool creates a raster representing the response variable predicted from
rasters representing the predictor variables.'),
isExposedToPythonCallers=True,
isExposedByCOM=True,
isExposedAsArcGISTool=True,
arcGISDisplayName=_(u'Predict GAM From Rasters'),
arcGISToolCategory=_(u'Statistics\\Model Data\\Generalized Additive
Models'),
dependencies=[ArcGISDependency(9, 2), RDependency(2, 6, 0),
RPackageDependency(u'rgdal')])
CopyArgumentMetadata(GAM.FitToArcGISTable, u'cls',
GAM.PredictFromArcGISRasters, u'cls')
CopyArgumentMetadata(GAM.PredictFromArcGISTable, u'inputModelFile',
GAM.PredictFromArcGISRasters, u'inputModelFile')
# The raster-prediction arguments are shared with
# GLM.PredictFromArcGISRasters.
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'outputResponseRaster',
GAM.PredictFromArcGISRasters, u'outputResponseRaster')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'templateRaster',
GAM.PredictFromArcGISRasters, u'templateRaster')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'rasterPredictorNames',
GAM.PredictFromArcGISRasters, u'rasterPredictorNames')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'predictorRasters',
GAM.PredictFromArcGISRasters, u'predictorRasters')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'constantPredictorNames',
GAM.PredictFromArcGISRasters, u'constantPredictorNames')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters,
u'constantPredictorValues', GAM.PredictFromArcGISRasters,
u'constantPredictorValues')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'cutoff',
GAM.PredictFromArcGISRasters, u'cutoff')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'resamplingTechniques',
GAM.PredictFromArcGISRasters, u'resamplingTechniques')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'ignoreOutOfRangeValues',
GAM.PredictFromArcGISRasters, u'ignoreOutOfRangeValues')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'outputErrorRaster',
GAM.PredictFromArcGISRasters, u'outputErrorRaster')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'buildPyramids',
GAM.PredictFromArcGISRasters, u'buildPyramids')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'overwriteExisting',
GAM.PredictFromArcGISRasters, u'overwriteExisting')
# Public method: GAM.BayesPredictFromArcGISRasters
# NOTE: this tool is deliberately unexposed (all isExposed* flags False)
# because it is not completely implemented; see the TODO below.
AddMethodMetadata(GAM.BayesPredictFromArcGISRasters,
shortDescription=_(u'Using a binomial generalized additive model (GAM)
fitted using the R mgcv package, this tool creates rasters representing the
estimated probabilities that the response variable will equal or exceed
specified thresholds, given rasters representing the predictor variables.'),
longDescription=_(u'WARNING: This tool is not completely implemented and
will fail if you try to use it.'),
isExposedToPythonCallers=False,
isExposedByCOM=False,
isExposedAsArcGISTool=False, # TODO: Get this tool working again.
#arcGISDisplayName=_(u'Predict Bayesian Probabilities for GAM From
Rasters'),
#arcGISToolCategory=_(u'Statistics\\Model Data\\Generalized Additive
Models'),
dependencies=[ArcGISDependency(9, 2), RDependency(2, 6, 0),
RPackageDependency(u'mgcv'), RPackageDependency(u'MASS'),
RPackageDependency(u'rgdal')])
CopyArgumentMetadata(GAM.FitToArcGISTable, u'cls',
GAM.BayesPredictFromArcGISRasters, u'cls')
AddArgumentMetadata(GAM.BayesPredictFromArcGISRasters, u'inputModelFile',
typeMetadata=FileTypeMetadata(mustExist=True),
description=_(
u"""File that contains the fitted model, generated by the Fit GAM
tool. The model must be of the binomial family, use a logit link
function, and have been fitted using R mgcv package."""),
arcGISDisplayName=_(u'Input model file'))
# Note the argument renames here: predictorRasters -> inputPredictorRasters
# and rasterPredictorNames -> variableNames.
CopyArgumentMetadata(GAM.PredictFromArcGISRasters, u'predictorRasters',
GAM.BayesPredictFromArcGISRasters, u'inputPredictorRasters')
CopyArgumentMetadata(GAM.PredictFromArcGISRasters, u'rasterPredictorNames',
GAM.BayesPredictFromArcGISRasters, u'variableNames')
# One probability raster is produced per threshold; the list lengths are
# enforced to match via mustBeSameLengthAsArgument below.
AddArgumentMetadata(GAM.BayesPredictFromArcGISRasters, u'thresholds',
typeMetadata=ListTypeMetadata(elementType=FloatTypeMetadata(),
minLength=1),
description=_(
u"""List of thresholds for which probabilities should be estimated.
You must specify one output probability raster for each threshold. See
the documentation for that parameter for more information."""),
arcGISDisplayName=_(u'Thresholds'))
AddArgumentMetadata(GAM.BayesPredictFromArcGISRasters,
u'outputProbabilityRasters',
typeMetadata=ListTypeMetadata(elementType=ArcGISRasterTypeMetadata(mustBeDifferentThanArguments=[u'inputModelFile'],
deleteIfParameterIsTrue=u'overwriteExisting', createParentDirectories=True),
minLength=1, mustBeSameLengthAsArgument=u'thresholds'),
description=_(
u"""List of output rasters representing the estimated probabilities
that the response variable will meet or exceed the thresholds you
specified, given the predictor rasters you specified. The
probabilities will range from 0 to 1.
For example, if you specify the threshold 0.2, the output probability
raster represents the estimated probability that the response variable
will be in the range 0.2 - 1.0. If a given pixel of the output raster
is close to 0, it indicates high likelihood that the response will be
less than 0.2 at that location, while a value close to 1.0 indicates
high likelihood that the response will be greater than 0.2.
The probability is estimated by comparing the predictor raster values
to samples drawn from the posterior distribution of these predictors
in the model. Please contact the author of this tool for more
information. (I will provide a better explanation once I have obtained
it from the statistician who wrote the algorithm.)
If any pixel is NoData in any predictor raster (after it has been
projected and clipped as needed), the pixel will be NoData in the
output rasters as well. This is because the response variable cannot
be predicted by the model if any of the predictor variables are
missing."""),
direction=u'Output',
arcGISDisplayName=_(u'Output probability rasters'))
CopyArgumentMetadata(GAM.PredictFromArcGISRasters, u'templateRaster',
GAM.BayesPredictFromArcGISRasters, u'templateRaster')
CopyArgumentMetadata(GAM.PredictFromArcGISRasters, u'resamplingTechniques',
GAM.BayesPredictFromArcGISRasters, u'resamplingTechniques')
CopyArgumentMetadata(GAM.PredictFromArcGISRasters, u'ignoreOutOfRangeValues',
GAM.BayesPredictFromArcGISRasters, u'ignoreOutOfRangeValues')
# Posterior-sample-count argument for the Bayesian prediction tool. Fixes
# two description defects: the run-together "memoryand", and the parameter
# cross-reference, which now matches the actual display name of the chunks
# parameter ('Split prediction into chunks').
AddArgumentMetadata(GAM.BayesPredictFromArcGISRasters, u'samples',
    typeMetadata=IntegerTypeMetadata(minValue=1),
    description=_(
u"""Number of samples to draw from the posterior distribution when
estimating the probability that a given prediction meets or exceeds
the thresholds you specified.

Large numbers increase statistical accuracy but require more memory and
processing time than small numbers. The memory increase is linear. I
am not sure about the processing time.

For quick, inaccurate results, use 100 samples. For publishable
results, use 1000 or 10,000. If the tool fails due to insufficient
memory, increase the value of the Split Prediction Into Chunks
parameter. For example, if you increase the number of samples by 10x,
you should also increase the number of chunks by 10x."""),
    arcGISDisplayName=_(u'Number of samples from the posterior distribution'),
    arcGISCategory=_(u'Prediction options'))
# Optional chunking to bound the memory used by the underlying R prediction
# function (see description).
AddArgumentMetadata(GAM.BayesPredictFromArcGISRasters, u'chunks',
typeMetadata=IntegerTypeMetadata(canBeNone=True, minValue=1),
description=_(
u"""Number of chunks to split the prediction into.
Increase this parameter if you find that this tool fails because there
is insufficient memory to perform your prediction. That failure occurs
because the underlying R function that performs the prediction assumes
that infinite memory is available and does not attempt to control its
utilization when performing large predictions. When a value is
provided for this parameter, this tool splits up the prediction job
into the specified number of same-size chunks and invokes the R
function on each chunk. This requires some additional processing but
cuts the memory required for the prediction to that needed for just
one of the chunks.
If you need to use this parameter we recommend that you start with a
value of 100. If that does not work, try 1000, 10000, and so on."""),
arcGISDisplayName=_(u'Split prediction into chunks'),
arcGISCategory=_(u'Prediction options'))
CopyArgumentMetadata(GAM.PredictFromArcGISRasters, u'buildPyramids',
GAM.BayesPredictFromArcGISRasters, u'buildPyramids')
CopyArgumentMetadata(GAM.PredictFromArcGISRasters, u'overwriteExisting',
GAM.BayesPredictFromArcGISRasters, u'overwriteExisting')
###############################################################################
# Metadata: TreeModel class
###############################################################################
AddClassMetadata(TreeModel,
shortDescription=_(u'Provides methods for modeling and prediction using
tree models (a.k.a. CARTs).'),
isExposedAsCOMServer=True,
comIID=u'{737951E5-51F7-4D54-85E1-ED955485C9CD}',
comCLSID=u'{F6F9A24E-C466-4C15-BF74-D26547854EA0}')
# Public method: TreeModel.FitToArcGISTable
AddMethodMetadata(TreeModel.FitToArcGISTable,
shortDescription=_(u'Fits a tree model to data in a table.'),
longDescription=_(
u"""Tree models were first introduced by Breiman et al. (1984) in the
classic Classification and Regression Tree (CART) software and are
frequently referenced by that name. Since that time, the original
methods have been reimplemented in R and many other statistical
programs. This tool fits tree models using the R rpart package by
Terry M. Therneau and Elizabeth J. Atkinson, and plots them using the
R rpart.plot package by Stephen Milborrow.
**References**
Breiman, L., Friedman, J.H., Olshen, R.A., Stone, C.J. (1984). Classification
and regression trees. Chapman & Hall/CRC.
`An Introduction to Recursive Partitioning Using the RPART Routines
<http://www.mayo.edu/hsr/techrpt/61.pdf>`_
`R rpart package documentation
<http://cran.r-project.org/web/packages/rpart/rpart.pdf>`_"""),
isExposedToPythonCallers=True,
isExposedByCOM=True,
isExposedAsArcGISTool=True,
arcGISDisplayName=_(u'Fit Tree Model'),
arcGISToolCategory=_(u'Statistics\\Model Data\\Tree Models'),
dependencies=[ArcGISDependency(9, 1), RDependency(2, 12, 0),
RPackageDependency(u'rpart', u'3.1-48'), RPackageDependency(u'rpart.plot',
u'1.0-0')])
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'cls',
typeMetadata=ClassOrClassInstanceTypeMetadata(cls=TreeModel),
description=_(u'%s class or an instance of it.') % TreeModel.__name__)
# Table, model-file, and formula arguments are shared with the GLM tool.
CopyArgumentMetadata(GLM.FitToArcGISTable, u'inputTable',
TreeModel.FitToArcGISTable, u'inputTable')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'outputModelFile',
TreeModel.FitToArcGISTable, u'outputModelFile')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'formula',
TreeModel.FitToArcGISTable, u'formula')
# Splitting method passed through to rpart; 'Class' additionally wraps the
# response in R's factor function (see description).
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'method',
typeMetadata=UnicodeStringTypeMetadata(allowedValues=[u'ANOVA', u'Class',
u'Exp', u'Poisson'], makeLowercase=True),
description=_(
u"""Method to use for splitting the tree, one of:
* ANOVA - Use this method to build a regression tree, i.e. when you
are modeling a continuous response variable, such as the abundance
of a species. With this method, the splits will be chosen to
maximize the between-groups sum-of-squares in a simple analysis of
variance.
* Class - Use this method to build a classification tree, i.e. when
you are modeling a categorical response variable, such as the
presence or absence of a species. When this method is selected, the
response variable is assumed to be categorical and the R factor
function is automatically applied to it.
* Exp - Use this method to build a regression tree using exponential
scaling. For more information about this method, please see the
references below.
* Poisson - Use this method to build a regression tree using Poisson
regression, which is appropriate for event rate data. For more
information about this method, please see the references below.
**References**
`An Introduction to Recursive Partitioning Using the RPART Routines
<http://www.mayo.edu/hsr/techrpt/61.pdf>`_
`R rpart package documentation
<http://cran.r-project.org/web/packages/rpart/rpart.pdf>`_"""),
arcGISDisplayName=_(u'Splitting method'))
# The where clause description is borrowed from the ArcGIS 9.1 select
# cursor's own documentation object.
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'where',
typeMetadata=SQLWhereClauseTypeMetadata(canBeNone=True),
description=ArcGIS91SelectCursor.__init__.__doc__.Obj.GetArgumentByName(u'where').Description,
arcGISParameterDependencies=[u'inputTable'],
arcGISDisplayName=_(u'Where clause'),
arcGISCategory=_(u'Tree growing options'))
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'allowMissingCovariates',
typeMetadata=BooleanTypeMetadata(),
description=_(
u"""If this option is enabled (the default), records will be included
in the model fitting process so long as they have a value for the
response variable and at least one predictor variable. If this option
is disabled, records must have values for the response variable and
all predictor variables in order to be included.
The R rpart package that is used to fit the model has the novel
capability of allowing records that are missing some data to still
participate in the model fitting process. For more information about
how this works, please see
`An Introduction to Recursive Partitioning Using the RPART Routines
<http://www.mayo.edu/hsr/techrpt/61.pdf>`_."""),
arcGISDisplayName=_(u'Include records that are missing covariates'),
arcGISCategory=_(u'Tree growing options'))
# The following arguments map onto rpart.control options; the documented
# defaults (20, 7, 0.01, 4, 5) come from the rpart package, per each
# description.
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'minSplit',
typeMetadata=IntegerTypeMetadata(minValue=2),
description=_(
u"""The minimum number of observations that must exist in a node of
the tree in order for a split of that node to be attempted. The
default value, 20, was taken from the R rpart package that is used to
fit the model."""),
arcGISDisplayName=_(u'Minimum number of observations to attempt a split'),
arcGISCategory=_(u'Tree growing options'))
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'minBucket',
typeMetadata=IntegerTypeMetadata(minValue=1),
description=_(
u"""The minimum number of observations that may be in any leaf node of
the tree. The default value, 7, was taken from the R rpart package
that is used to fit the model. By default, rpart recommends that this
parameter be set to one third of the previous parameter."""),
arcGISDisplayName=_(u'Minimum number of observations in a leaf node'),
arcGISCategory=_(u'Tree growing options'))
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'cp',
typeMetadata=FloatTypeMetadata(minValue=0.),
description=_(
u"""Any split that does not decrease the overall lack of fit by a
factor of this parameter will not be attempted. For instance, with
ANOVA splitting, this means that the overall Rsquare must increase by
this parameter at each step. The main role of this parameter is to
save computing time by pruning off splits that are obviously not
worthwhile. Essentially, you inform the tool that any split which does
not improve the fit by this parameter will likely be pruned off by
cross-validation, and that hence the tool need not pursue it.
The default value, 0.01, was taken from the R rpart package that is
used to fit the model."""),
arcGISDisplayName=_(u'Complexity parameter'),
arcGISCategory=_(u'Tree growing options'))
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'maxCompete',
typeMetadata=IntegerTypeMetadata(minValue=0),
description=_(
u"""The number of competitor splits to retain in the output. It is
useful to know not just which split was chosen, but which variable
came in second, third, etc. The default value, 4, was taken from the R
rpart package that is used to fit the model."""),
arcGISDisplayName=_(u'Number of competitor splits to retain'),
arcGISCategory=_(u'Tree growing options'))
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'maxSurrogate',
typeMetadata=IntegerTypeMetadata(minValue=0),
description=_(
u"""The number of surrogate splits to retain in the output. If this is
set to zero the compute time will be shortened, since approximately
half of the computational time (other than setup) is used in the
search for surrogate splits. The default value, 5, was taken from the
R rpart package that is used to fit the model."""),
arcGISDisplayName=_(u'Number of surrogate splits to retain'),
arcGISCategory=_(u'Tree growing options'))
# Surrogate-usage argument (rpart's usesurrogate option). Fixes a grammar
# error in the description ("This is the recommendations" ->
# "recommendation").
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'useSurrogate',
    typeMetadata=IntegerTypeMetadata(allowedValues=[0, 1, 2]),
    description=_(
u"""The method for using surrogates in the splitting process, one of:

* 0 - display only; an observation with a missing value for the
  primary split rule is not sent further down the tree.

* 1 - use surrogates, in order, to split subjects missing the primary
  variable; if all surrogates are missing the observation is not
  split.

* 2 - if all surrogates are missing, then send the observation in the
  majority direction. This is the recommendation of Breiman, et al.

The default value, 2, was taken from the R rpart package that is used
to fit the model."""),
    arcGISDisplayName=_(u'Surrogate usage method'),
    arcGISCategory=_(u'Tree growing options'))
# Surrogate-selection argument (rpart's surrogatestyle option).
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'surrogateStyle',
typeMetadata=IntegerTypeMetadata(allowedValues=[0, 1]),
description=_(
u"""The method used to select the best surrogate, one of:
* 0 - the tool uses the total number of correct classifications for a
potential surrogate variable.
* 1 - the tool uses the percent correct, calculated over the
non-missing values of the surrogate.
The default value, 0, was taken from the R rpart package that is used
to fit the model. This value more severely penalizes covariates with a
large number of missing values."""),
arcGISDisplayName=_(u'Surrogate selection method'),
arcGISCategory=_(u'Tree growing options'))
# The xval default is deliberately raised from rpart's 10 to 1000 for more
# stable cross-validation error estimates (see description).
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'xval',
typeMetadata=IntegerTypeMetadata(minValue=1),
description=_(
u"""The number of cross-validations to perform.
The R rpart package that is used to fit the model uses a default of
10, but we have found that so few iterations can cause the calculated
cross-validation error to differ substantially over several runs of
the tool using identical input data and parameter values. In one case,
we observed the cross-validation errors to vary by over 10%. Because
the cross-validation errors are often used to prune the tree, we
believe it is important to have accurate estimates of them, so we
increased the default to 1000. This will cause complicated models to
run substantially slower. If you find your model is too slow, decrease
the value."""),
arcGISDisplayName=_(u'Number of cross-validations'),
arcGISCategory=_(u'Tree growing options'))
# maxValue=30 matches the depth limit documented below.
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'maxDepth',
typeMetadata=IntegerTypeMetadata(minValue=0, maxValue=30),
description=_(
u"""The maximum depth of any node of the final tree, with the root
node counted as depth 0. The default value, 30, was taken from the R
rpart package that is used to fit the model."""),
arcGISDisplayName=_(u'Maximum tree depth'),
arcGISCategory=_(u'Tree growing options'))
# Pruning-method argument. Omitting it (None) means no pruning. Fixes a typo
# in the description ("equivalent ot the minimum" -> "equivalent to the
# minimum").
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'pruningMethod',
    typeMetadata=UnicodeStringTypeMetadata(canBeNone=True, allowedValues=[u'Minimum error', u'1-SE rule', u'Interactive', u'User specified CP'], makeLowercase=True),
    description=_(
u"""Method to use for pruning the tree, one of:

* Minimum error - The tool will prune the tree using the complexity
  parameter associated with the minimum cross-validation error.

* 1-SE rule - The tool will prune the tree using the complexity
  parameter obtained using the 1-SE rule described by Therneau and
  Atkinson in chapter 4 of
  `An Introduction to Recursive Partitioning Using the RPART Routines
  <http://www.mayo.edu/hsr/techrpt/61.pdf>`_.

  A plot of the mean cross-validation errors versus candidate values
  of the complexity parameter often has an initial sharp drop followed
  by a relatively flat plateau and then a slow rise. (This tool
  produces that plot as an optional diagnostic output.) According to
  the 1-SE rule, any cross-validation error within one standard error
  of the minimum cross-validation error is considered equivalent to
  the minimum (i.e. considered to be part of the flat plateau). The
  1-SE rule chooses the largest complexity parameter that yields a
  cross-validation error equivalent to the minimum. This results in an
  optimal tree, i.e. the tree with the fewest number of splits that
  yields a cross-validation error equivalent to the minimum.

* Interactive - The tool will display the unpruned tree in a window,
  allowing you to prune it interactively with the mouse. If you click
  on a split it will be marked as deleted. If you click on an
  already-deleted split it will be undeleted (if its parent is not
  deleted). Information about the node is printed as you click. When
  you have finished pruning, click on the QUIT button.

* User specified CP - The tool will prune the tree using the
  complexity parameter you specify below.

If this parameter is omitted, the tree will not be pruned."""),
    arcGISDisplayName=_(u'Pruning method'),
    arcGISCategory=_(u'Tree pruning options'))
# Complexity parameter used only when pruningMethod is 'User specified CP'.
# Fixes an inconsistency: the description referred to the option as 'User
# specified', but the allowed value and displayed choice is
# 'User specified CP'.
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'pruningCP',
    typeMetadata=FloatTypeMetadata(canBeNone=True, mustBeGreaterThan=0.),
    description=_(
u"""Complexity parameter for pruning the tree. This parameter is only
used when the Pruning Method is set to 'User specified CP'."""),
    arcGISDisplayName=_(u'Complexity parameter for pruning'),
    arcGISCategory=_(u'Tree pruning options'))
CopyArgumentMetadata(GLM.FitToArcGISTable, u'xColumnName',
TreeModel.FitToArcGISTable, u'xColumnName')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'yColumnName',
TreeModel.FitToArcGISTable, u'yColumnName')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'zColumnName',
TreeModel.FitToArcGISTable, u'zColumnName')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'mColumnName',
TreeModel.FitToArcGISTable, u'mColumnName')
# "Additional output options" arguments for TreeModel.FitToArcGISTable:
# summary file, diagnostic plots, and plots of the unpruned/pruned trees.
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'writeSummaryFile',
typeMetadata=BooleanTypeMetadata(),
description=_(
u"""If True, this tool will write summary information about the fitted
model to a text file. (This is the same information that the tool
outputs as log messages.) The file will have the name X_summary.txt,
where X is the name of the output model file, minus any
extension."""),
arcGISDisplayName=_(u'Write model summary file'),
arcGISCategory=_(u'Additional output options'))
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'writeDiagnosticPlots',
typeMetadata=BooleanTypeMetadata(),
description=_(
u"""If True, this tool will write diagnostic plots:
* X_cp.Y - visual representation of the cross-validation results for
the unpruned tree, to assist you with choosing a Complexity
Parameter for pruning the tree. The x-axis represents possible
choices for the Complexity Parameter and the y-axis represents the
means and standard deviations of the errors in the cross-validated
prediction that would result. The dashed horizontal line is drawn 1
standard error above the minimum of the curve. A good choice of the
Complexity Parameter for pruning is the leftmost value for which the
mean error lies below the line. This value will be chosen
automatically if the Pruning Method parameter is set to '1-SE rule'.
* X_rsquare.Y - two-panel plot only produced for the ANOVA splitting
method. The first panel shows the r-square (both apparent and
apparent from cross-validation) versus the number of splits. The
second panel shows the mean error in the cross-validated prediction
versus the number of splits (this is essentially the same plot as
the X_cp.Y plot described above). Both panels are produced for the
unpruned tree.
* X_residuals.Y - plot of the residuals vs. the fitted values for the
unpruned tree.
* X_pruned_residuals.Y - plot of the residuals vs. the fitted values
for the pruned tree. This plot will only be produced if the tree is
pruned.
In the file names above, X is the name of the output model file, minus
any extension, and Y is the extension of the selected output plot
format."""),
arcGISDisplayName=_(u'Write diagnostic plots'),
arcGISCategory=_(u'Additional output options'))
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'writeTreePlot',
typeMetadata=BooleanTypeMetadata(),
description=_(
u"""If True, this tool will write a plot of the unpruned tree to a
file having the name X_unpruned_tree.Y, where X is the name of the
output model file minus the extension and Y is the extension of the
selected output plot format."""),
arcGISDisplayName=_(u'Write tree plot'),
arcGISCategory=_(u'Additional output options'))
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'writePrunedTreePlot',
typeMetadata=BooleanTypeMetadata(),
description=_(
u"""If True, this tool will write a plot of the pruned tree to a file
having the name X_pruned_tree.Y, where X is the name of the output
model file minus the extension and Y is the extension of the selected
output plot format. This plot will only be produced if the tree is
pruned."""),
arcGISDisplayName=_(u'Write pruned tree plot'),
arcGISCategory=_(u'Additional output options'))
# Plot-format arguments shared with GLM.FitToArcGISTable, plus the
# tree-plot-specific pointSize and treePlotType arguments.
CopyArgumentMetadata(GLM.FitToArcGISTable, u'plotFileFormat',
TreeModel.FitToArcGISTable, u'plotFileFormat')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'res',
TreeModel.FitToArcGISTable, u'res')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'width',
TreeModel.FitToArcGISTable, u'width')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'height',
TreeModel.FitToArcGISTable, u'height')
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'pointSize',
typeMetadata=FloatTypeMetadata(minValue=1.0),
description=_(
u"""The default pointsize of text in diagnostic plots (the size of the
text in tree plots is controlled by a different parameter)."""),
arcGISDisplayName=_(u'Default pointsize of text in diagnostic plots'),
arcGISCategory=_(u'Additional output options'))
CopyArgumentMetadata(GLM.FitToArcGISTable, u'bg', TreeModel.FitToArcGISTable,
u'bg')
# treePlotType corresponds to the 'type' argument of rpart-style tree plots.
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'treePlotType',
typeMetadata=IntegerTypeMetadata(allowedValues=[0, 1, 2, 3, 4]),
description=_(
u"""Type of tree plots to create, one of:
* 0 - The default. Draw a split label at each split and a node label
at each leaf.
* 1 - Label all nodes, not just leaves.
* 2 - Like 1 but draw the split labels below the node labels. Similar
to the plots in the CART book.
* 3 - Draw separate split labels for the left and right directions.
* 4 - Like 3 but label all nodes, not just leaves.
"""),
arcGISDisplayName=_(u'Plot type'),
arcGISCategory=_(u'Tree plot options'))
# 'extra' selects the per-node annotation for tree plots (it mirrors the
# 'extra' argument of the R rpart.plot/prp function). The description text
# for item 2 had been garbled by line re-wrapping; it is rewrapped cleanly
# here. No behavior other than that display string changes.
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'extra',
    typeMetadata=IntegerTypeMetadata(allowedValues=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
    description=_(
u"""Extra information to display at the nodes, one of:
* 0 - No extra information.
* 1 - The default. Display the number of observations that fall in the
node (per class for Class models; prefixed by the number of events
for Poisson and Exp models).
* 2 - Class models: display the classification rate at the node,
expressed as the number of correct classifications and the number of
observations in the node.
Poisson and Exp models: display the number of events.
* 3 - Class models only: misclassification rate at the node, expressed
as the number of incorrect classifications and the number of
observations in the node.
* 4 - Class models only: probability per class of observations in the
node (conditioned on the node, sum across a node is 1).
* 5 - Class models only: like 4 but do not display the fitted class.
* 6 - Class models only: the probability of the second class only.
Useful for binary responses.
* 7 - Class models only: like 6 but do not display the fitted class.
* 8 - Class models only: the probability of the fitted class.
* 9 - Class models only: the probabilities times the fraction of
observations in the node (the probability relative to all
observations, sum across all leaves is 1).
"""),
    arcGISDisplayName=_(u'Extra information'),
    arcGISCategory=_(u'Tree plot options'))
# Node-label layout options for tree plots.
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'percentage',
typeMetadata=BooleanTypeMetadata(),
description=_(
u"""If True, the default, nodes will be labeled with the percentage of
observations in the node. The percentage will be displayed below the
"extra information" (if any is requested)."""),
arcGISDisplayName=_(u'Display percentage of observations'),
arcGISCategory=_(u'Tree plot options'))
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'under',
typeMetadata=BooleanTypeMetadata(),
description=_(
u"""If True, the default, extra information and percentage of
observations will be displayed below the nodes. If False, they will be
displayed within the nodes' boxes.
This parameter is ignored if neither extra information nor percentage
of observations are requested."""),
arcGISDisplayName=_(u'Display extra text under node boxes'),
arcGISCategory=_(u'Tree plot options'))
# Controls whether "variable=" is clipped from right-hand split labels on
# plot types 3 and 4. Fix: the last sentence of the description was missing
# the word "if" ("ignored the plot type" -> "ignored if the plot type").
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'clipRightLabels',
typeMetadata=BooleanTypeMetadata(),
description=_(
u"""If True, the default, the right-hand split labels on plots of type
3 or 4 will not include "variable=". If False, the right-hand labels
will include "variable=", just like the left-hand labels.
This parameter is ignored if the plot type is not 3 or 4."""),
arcGISDisplayName=_(u'Clip right-hand split labels'),
arcGISCategory=_(u'Tree plot options'))
# Tree-layout options: where leaves are drawn and how branches are sized.
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'fallenLeaves',
typeMetadata=BooleanTypeMetadata(),
description=_(
u"""If True, all leaf nodes will be displayed at the bottom. If False,
the default, leaf nodes will be displayed where they would normally
appear."""),
arcGISDisplayName=_(u'Display leaves at bottom'),
arcGISCategory=_(u'Tree plot options'))
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'branchType',
typeMetadata=IntegerTypeMetadata(allowedValues=[0, 1, 2, 3, 4, 5, 6, 7,
8, 9]),
description=_(
u"""Type of branches to draw. If zero, the default, the tool will draw
conventional branches having a constant narrow width. If nonzero, the tool
will draw "wide branches", with branch widths proportional to the
specified parameter, one of:
* 1 - Deviance
* 2 - Square root of deviance
* 3 - Deviance / number of observations
* 4 - Square root of (deviance / number of observations)
* 5 - Number of observations
* 6 - Complexity parameter
* 7 - Absolute value of the predicted value
* 8 - Predicted value minus the minimum predicted value
* 9 - Constant wide width, for checking the visual distortion that
results when wide branches are drawn at different angles
"""),
arcGISDisplayName=_(u'Branch type'),
arcGISCategory=_(u'Tree plot options'))
# Branch-shape option for tree plots. Fix: grammar in the description
# ("a value of 0 give" -> "a value of 0 gives").
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'branch',
typeMetadata=FloatTypeMetadata(minValue=0., maxValue=1.),
description=_(
u"""Controls the shape of the branches from parent to child nodes. Any
number from 0 to 1 is allowed. A value of 1 gives square shouldered
branches, a value of 0 gives V shaped branches, with other values being
intermediate.
Note that if the Branch Type parameter is nonzero, the Branch Shape
parameter will be rounded to 1 or 0 (e.g. a Branch Shape of 0.75 will
be rounded to 1)."""),
arcGISDisplayName=_(u'Branch shape'),
arcGISCategory=_(u'Tree plot options'))
# Remaining tree-plot fine-tuning options: vertical spacing, label digits,
# name abbreviation, text magnification, and horizontal compression.
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'uniform',
typeMetadata=BooleanTypeMetadata(),
description=_(
u"""If True, the default, the vertical spacing of the nodes will be
uniform. If False, the nodes will be spaced proportionally to the fit
(more precisely, to the difference between a node's deviance and the
sum of its children's deviances). Small spaces must be expanded to
leave room for the labels.
Note: if this parameter is False and the Text Magnification Factor is
omitted (the default), very small text can sometimes result."""),
arcGISDisplayName=_(u'Use uniform vertical spacing'),
arcGISCategory=_(u'Tree plot options'))
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'digits',
typeMetadata=IntegerTypeMetadata(minValue=0),
description=_(
u"""Number of significant digits to display in floating-point numbers.
Probabilities and percentages are treated specially. Probabilities are
displayed with the specified number of digits after the decimal point
(by default 2 digits). Percentages are displayed with the specified
number of digits minus 2 after the decimal point (by default no
digits)."""),
arcGISDisplayName=_(u'Significant digits for labels'),
arcGISCategory=_(u'Tree plot options'))
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'varlen',
typeMetadata=IntegerTypeMetadata(),
description=_(
u"""Length of variable names in text at the splits (and, for class
responses, the class displayed at the node). There are three
possibilities:
* 0 - The default. Use full names.
* >0 - Use an abbreviation algorithm to shorten the names to at
least the specified number, such that they remain unique.
* <0 - Truncate names to the shortest length where they are still
unique, but never truncate to shorter than the specified number
(e.g. the value -5 means never truncate to shorter than 5
characters).
"""),
arcGISDisplayName=_(u'Length of variable names at splits'),
arcGISCategory=_(u'Tree plot options'))
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'faclen',
typeMetadata=IntegerTypeMetadata(),
description=_(
u"""Length of factor level names (i.e. categorical variable values) in
splits. There are four possibilities:
* 0 - The default. Use full names.
* 1 - Represent factor levels with alphabetic characters (a for the
first level, b for the second, and so on).
* >1 - Use an abbreviation algorithm to shorten the names to at
least the specified number, such that they remain unique.
* <0 - Truncate names to the shortest length where they are still
unique, but never truncate to shorter than the specified number
(e.g. the value -5 means never truncate to shorter than 5
characters).
"""),
arcGISDisplayName=_(u'Length of factor level names in splits'),
arcGISCategory=_(u'Tree plot options'))
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'cex',
typeMetadata=FloatTypeMetadata(canBeNone=True, mustBeGreaterThan=0.),
description=_(
u"""A numerical value giving the amount by which text should be
magnified relative to the default. If omitted, the default, the text
size will be calculated automatically.
The default automatic calculation means that this seemingly innocuous
argument has a far reaching effect. If necessary it will trigger the
node shifting engine to get a decent type size (see the Compress Tree
Vertically parameter)."""),
arcGISDisplayName=_(u'Text magnification factor'),
arcGISCategory=_(u'Tree plot options'))
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'tweak',
typeMetadata=FloatTypeMetadata(mustBeGreaterThan=0.),
description=_(
u"""Adjust the (possibly automatically calculated) Text Magnification
Factor. For example, use 1.1 to make the text 10% larger. The default
is 1, meaning no adjustment.
Note that font sizes are discrete, so the Text Magnification Factor
you ask for may not be the one you get. And a small tweak may not
actually change the type size or change it more than you want."""),
arcGISDisplayName=_(u'Tweak text magnification'),
arcGISCategory=_(u'Tree plot options'))
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'compress',
typeMetadata=BooleanTypeMetadata(),
description=_(
u"""If True, the default, the tree will be compressed horizontally by
shifting nodes horizontally where space is available."""),
arcGISDisplayName=_(u'Compress tree horizontally'),
arcGISCategory=_(u'Tree plot options'))
# Vertical label-shifting option for tree plots. Fix: typo in the
# description ("if you fell" -> "if you feel").
AddArgumentMetadata(TreeModel.FitToArcGISTable, u'ycompress',
typeMetadata=BooleanTypeMetadata(),
description=_(
u"""If True, the default, and the initial automatically calculated
Text Magnification Factor is less than 0.7, crowded labels will be
shifted vertically where space is available. This often allows
considerably larger text.
Set this parameter to False if you feel the resulting plot is too
messy. The shifting algorithm may work a little better (allowing
larger text) for plot types 1, 2, and 3."""),
arcGISDisplayName=_(u'Compress tree vertically'),
arcGISCategory=_(u'Tree plot options'))
# Reuse GLM's overwrite-existing-outputs argument metadata.
CopyArgumentMetadata(GLM.FitToArcGISTable, u'overwriteExisting',
TreeModel.FitToArcGISTable, u'overwriteExisting')
# Public method: TreeModel.PredictFromArcGISTable
#
# Fix: the shortDescription string literal and the trailing "# e1071 is
# required by caret" comment had been split across lines (a line-wrapping
# corruption), which made this statement a syntax error. Rejoined here;
# the metadata values themselves are unchanged.
AddMethodMetadata(TreeModel.PredictFromArcGISTable,
    shortDescription=_(u'Given a fitted tree model, this tool predicts the response variable for each row of a table.'),
    longDescription=_(
u"""If a table is not provided, the prediction will be done on the
training data used to fit the model.
On completion, the tool outputs statistics that summarize how well the
model's predictions match the observed values of the response
variable, unless a table is provided that does not contain the
observed values of the response variable."""),
    isExposedToPythonCallers=True,
    isExposedByCOM=True,
    isExposedAsArcGISTool=True,
    arcGISDisplayName=_(u'Predict Tree Model From Table'),
    arcGISToolCategory=_(u'Statistics\\Model Data\\Tree Models'),
    dependencies=[ArcGISDependency(9, 2), RDependency(2, 12, 0),
                  RPackageDependency(u'rpart', u'3.1-48'), RPackageDependency(u'ROCR'),
                  RPackageDependency(u'e1071'), RPackageDependency(u'caret')])  # e1071 is required by caret
# Arguments for TreeModel.PredictFromArcGISTable: cls and the input
# model/table arguments (mostly copied from GLM's prediction tool).
CopyArgumentMetadata(TreeModel.FitToArcGISTable, u'cls',
TreeModel.PredictFromArcGISTable, u'cls')
AddArgumentMetadata(TreeModel.PredictFromArcGISTable, u'inputModelFile',
typeMetadata=FileTypeMetadata(mustExist=True),
description=_(
u"""File that contains the fitted model, generated by the Fit Tree
Model tool."""),
arcGISDisplayName=_(u'Input model file'))
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'inputTable',
TreeModel.PredictFromArcGISTable, u'inputTable')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'predictedValuesField',
TreeModel.PredictFromArcGISTable, u'predictedValuesField')
# Binary classification cutoff for tree predictions. Fix: typo in the
# description ("discusson" -> "discussion").
AddArgumentMetadata(TreeModel.PredictFromArcGISTable, u'cutoff',
typeMetadata=FloatTypeMetadata(minValue=0., maxValue=1., canBeNone=True),
description=_(
u"""Cutoff to use when classifying the continuous probability output
by a binary classification tree into an integer result (0 or 1).
Probabilities greater than or equal to the cutoff are classified as 1;
probabilities less than the cutoff are classified as 0.
This parameter should not be used for regression trees or
classification trees that have more than two classes.
For binary classification trees, if a value is not provided, the tool
will automatically select the value that maximizes the value of the
Youden index (see Perkins and Schisterman, 2006), thereby attempting
to minimize the misclassification rate of the model. This approach may
not be optimal for your application; we encourage you to review the
extensive discussion of cutoffs in the scientific literature and select
a value deliberately.
References
Perkins NJ, Schisterman EF (2006) The Inconsistency of "Optimal"
Cutpoints Obtained using Two Criteria based on the Receiver Operating
Characteristic Curve. American Journal of Epidemiology 163:
670-675."""),
arcGISDisplayName=_(u'Binary classification cutoff'))
# Remaining arguments and results of TreeModel.PredictFromArcGISTable are
# identical to GLM.PredictFromArcGISTable's; copy their metadata.
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'where',
TreeModel.PredictFromArcGISTable, u'where')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'ignoreOutOfRangeValues',
TreeModel.PredictFromArcGISTable, u'ignoreOutOfRangeValues')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'noDataValue',
TreeModel.PredictFromArcGISTable, u'noDataValue')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'outputPlotFile',
TreeModel.PredictFromArcGISTable, u'outputPlotFile')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'measure1',
TreeModel.PredictFromArcGISTable, u'measure1')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'measure2',
TreeModel.PredictFromArcGISTable, u'measure2')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'colorize',
TreeModel.PredictFromArcGISTable, u'colorize')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'outputSummaryFile',
TreeModel.PredictFromArcGISTable, u'outputSummaryFile')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'res',
TreeModel.PredictFromArcGISTable, u'res')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'width',
TreeModel.PredictFromArcGISTable, u'width')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'height',
TreeModel.PredictFromArcGISTable, u'height')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'pointSize',
TreeModel.PredictFromArcGISTable, u'pointSize')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'bg',
TreeModel.PredictFromArcGISTable, u'bg')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'overwriteExisting',
TreeModel.PredictFromArcGISTable, u'overwriteExisting')
CopyResultMetadata(GLM.PredictFromArcGISTable, u'updatedTable',
TreeModel.PredictFromArcGISTable, u'updatedTable')
CopyResultMetadata(GLM.PredictFromArcGISTable, u'outputCutoff',
TreeModel.PredictFromArcGISTable, u'outputCutoff')
# Public method: TreeModel.PredictFromArcGISRasters
#
# Fix: the shortDescription string literal had been split across three
# lines (line-wrapping corruption), making this statement a syntax error.
# Rejoined here; the metadata values are unchanged.
AddMethodMetadata(TreeModel.PredictFromArcGISRasters,
    shortDescription=_(u'Using a fitted tree model, this tool creates a raster representing the response variable predicted from rasters representing the predictor variables.'),
    isExposedToPythonCallers=True,
    isExposedByCOM=True,
    isExposedAsArcGISTool=True,
    arcGISDisplayName=_(u'Predict Tree Model From Rasters'),
    arcGISToolCategory=_(u'Statistics\\Model Data\\Tree Models'),
    dependencies=[ArcGISDependency(9, 1), RDependency(2, 12, 0),
                  RPackageDependency(u'rpart', u'3.1-48'), RPackageDependency(u'rgdal')])
# Arguments of TreeModel.PredictFromArcGISRasters are identical to those of
# GLM.PredictFromArcGISRasters (and the tree tools above); copy metadata.
CopyArgumentMetadata(TreeModel.FitToArcGISTable, u'cls',
TreeModel.PredictFromArcGISRasters, u'cls')
CopyArgumentMetadata(TreeModel.PredictFromArcGISTable, u'inputModelFile',
TreeModel.PredictFromArcGISRasters, u'inputModelFile')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'outputResponseRaster',
TreeModel.PredictFromArcGISRasters, u'outputResponseRaster')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'templateRaster',
TreeModel.PredictFromArcGISRasters, u'templateRaster')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'rasterPredictorNames',
TreeModel.PredictFromArcGISRasters, u'rasterPredictorNames')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'predictorRasters',
TreeModel.PredictFromArcGISRasters, u'predictorRasters')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'constantPredictorNames',
TreeModel.PredictFromArcGISRasters, u'constantPredictorNames')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters,
u'constantPredictorValues', TreeModel.PredictFromArcGISRasters,
u'constantPredictorValues')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'cutoff',
TreeModel.PredictFromArcGISRasters, u'cutoff')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'resamplingTechniques',
TreeModel.PredictFromArcGISRasters, u'resamplingTechniques')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'ignoreOutOfRangeValues',
TreeModel.PredictFromArcGISRasters, u'ignoreOutOfRangeValues')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'buildPyramids',
TreeModel.PredictFromArcGISRasters, u'buildPyramids')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'overwriteExisting',
TreeModel.PredictFromArcGISRasters, u'overwriteExisting')
###############################################################################
# Metadata: LinearMixedModel class
###############################################################################

# Fix: the shortDescription string literal had been split across two lines
# (line-wrapping corruption), making this statement a syntax error.
# Rejoined here; the metadata values are unchanged.
AddClassMetadata(LinearMixedModel,
    shortDescription=_(u'Provides methods for fitting and predicting linear mixed-effects models.'),
    isExposedAsCOMServer=True,
    comIID=u'{4678EDA8-073A-496B-81B6-6DC8F47E4EC3}',
    comCLSID=u'{A65B2686-1240-44E8-8764-397EB58BFCD7}')
# Public method: LinearMixedModel.FitToArcGISTable
#
# Fixes: (1) the shortDescription string literal had been split across two
# lines (line-wrapping corruption), making this statement a syntax error —
# rejoined; (2) typo "Jose Pinherio" -> "Jose Pinheiro" (author of nlme).
# The quoted R-documentation text is otherwise reproduced verbatim.
AddMethodMetadata(LinearMixedModel.FitToArcGISTable,
    shortDescription=_(u'Fits a linear mixed-effects model to data in a table.'),
    longDescription=_(
u"""This tool fits a linear mixed-effects model using the lme function
from the R `nlme package
<http://cran.r-project.org/web/packages/nlme/nlme.pdf>`_
written by Jose Pinheiro and Douglas Bates (2000). Please see the
documentation for that function for more information.
**References**
The R documentation for the nlme package states the following about
the lme function: The computational methods follow the general
framework of Lindstrom and Bates (1988). The model formulation is
described in Laird and Ware (1982). The variance-covariance
parametrizations are described in Pinheiro and Bates (1996). The
different correlation structures that may be used are described in
Box, Jenkins and Reinse (1994), Littel et al (1996), and Venables and
Ripley, (1997). The use of variance functions for linear and nonlinear
mixed effects models is presented in detail in Davidian and Giltinan
(1995).
Box, G.E.P., Jenkins, G.M., and Reinsel G.C. (1994) "Time Series
Analysis: Forecasting and Control", 3rd Edition, Holden-Day.
Davidian, M. and Giltinan, D.M. (1995) "Nonlinear Mixed Effects Models
for Repeated Measurement Data", Chapman and Hall.
Laird, N.M. and Ware, J.H. (1982) "Random-Effects Models for
Longitudinal Data", Biometrics, 38, 963-974.
Lindstrom, M.J. and Bates, D.M. (1988) "Newton-Raphson and EM
Algorithms for Linear Mixed-Effects Models for Repeated-Measures
Data", Journal of the American Statistical Association, 83, 1014-1022.
Littel, R.C., Milliken, G.A., Stroup, W.W., and Wolfinger, R.D. (1996)
"SAS Systems for Mixed Models", SAS Institute.
Pinheiro, J.C. and Bates., D.M. (1996) "Unconstrained Parametrizations
for Variance-Covariance Matrices", Statistics and Computing, 6,
289-296.
Pinheiro, J.C., and Bates, D.M. (2000) "Mixed-Effects Models in S and
S-PLUS", Springer.
Venables, W.N. and Ripley, B.D. (2002) "Modern Applied Statistics with
S", 4th Edition, Springer-Verlag.
`R nlme package documentation
<http://cran.r-project.org/web/packages/nlme/nlme.pdf>`_"""),
    isExposedToPythonCallers=True,
    isExposedByCOM=True,
    isExposedAsArcGISTool=True,
    arcGISDisplayName=_(u'Fit Linear Mixed Model'),
    arcGISToolCategory=_(u'Statistics\\Model Data\\Linear Mixed Models'),
    dependencies=[ArcGISDependency(9, 1), RDependency(2, 7, 0)])
# Arguments for LinearMixedModel.FitToArcGISTable: cls, shared table/output
# arguments copied from GLM, the fixed- and random-effects formulas, and
# the fitting method (REML vs. ML).
AddArgumentMetadata(LinearMixedModel.FitToArcGISTable, u'cls',
typeMetadata=ClassOrClassInstanceTypeMetadata(cls=LinearMixedModel),
description=_(u'%s class or an instance of it.') %
LinearMixedModel.__name__)
CopyArgumentMetadata(GLM.FitToArcGISTable, u'inputTable',
LinearMixedModel.FitToArcGISTable, u'inputTable')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'outputModelFile',
LinearMixedModel.FitToArcGISTable, u'outputModelFile')
AddArgumentMetadata(LinearMixedModel.FitToArcGISTable, u'fixedFormula',
typeMetadata=UnicodeStringTypeMetadata(),
description=_(
u"""Two-sided formula that specifies the fixed-effects part of the
model.
""" + _FormulaDescription),
arcGISDisplayName=_(u'Fixed effects formula'))
AddArgumentMetadata(LinearMixedModel.FitToArcGISTable, u'randomFormula',
typeMetadata=UnicodeStringTypeMetadata(),
description=_(
u"""One-sided formula that specifies the random-effects part of the
model. The formula has the form::
~x1+...+xn | g1/.../gm
with x1+...+xn specifying the model for the random effects and
g1/.../gm the grouping structure (m may be equal to 1, in which case
no / is required). The random effects formula will be repeated for all
levels of grouping, in the case of multiple levels of grouping."""),
arcGISDisplayName=_(u'Random effects formula'))
CopyArgumentMetadata(GLM.FitToArcGISTable, u'where',
LinearMixedModel.FitToArcGISTable, u'where')
AddArgumentMetadata(LinearMixedModel.FitToArcGISTable, u'method',
typeMetadata=UnicodeStringTypeMetadata(allowedValues=[u'REML', u'ML']),
description=_(
u"""Model fitting method, one of:
* REML - the model will be fit by maximizing the restricted
log-likelihood. This is the default.
* ML - the model will be fit by maximizing the log-likelihood.
"""),
arcGISDisplayName=_(u'Model fitting method'),
arcGISCategory=_(u'Model options'))
# Within-group correlation structure parameters. The five allowed values
# match the corSpatial classes of the R nlme package (corExp, corGaus,
# corLin, corRatio, corSpher), and the remaining parameters match those
# classes' constructor arguments (form, value, nugget, metric, fixed).
# The original descriptions were "TODO" placeholders; the text below is
# based on the nlme corSpatial documentation — verify against the tool's
# implementation before release.

AddArgumentMetadata(LinearMixedModel.FitToArcGISTable,
    u'correlationStructure',
    typeMetadata=UnicodeStringTypeMetadata(canBeNone=True,
        makeLowercase=True, allowedValues=[u'Exponential', u'Gaussian', u'Linear',
        u'Rational quadratic', u'Spherical']),
    description=_(
u"""Spatial correlation structure used to model within-group
correlation, one of:
* Exponential - the corExp class of the R nlme package.
* Gaussian - the corGaus class.
* Linear - the corLin class.
* Rational quadratic - the corRatio class.
* Spherical - the corSpher class.
If this parameter is omitted, no within-group correlation structure
will be used. Please see the R nlme package documentation for details
of these structures."""),
    arcGISDisplayName=_(u'Correlation structure'),
    arcGISCategory=_(u'Within-group correlation structure'))

AddArgumentMetadata(LinearMixedModel.FitToArcGISTable, u'correlationFormula',
    typeMetadata=UnicodeStringTypeMetadata(canBeNone=True),
    description=_(
u"""One-sided formula specifying the covariates used to compute the
distances between observations, for example ~x+y, optionally followed
by a | operator and a grouping expression. This formula is passed to
the constructor of the structure selected with the Correlation
Structure parameter, and is ignored if that parameter is omitted."""),
    arcGISDisplayName=_(u'Correlation formula'),
    arcGISCategory=_(u'Within-group correlation structure'))

AddArgumentMetadata(LinearMixedModel.FitToArcGISTable, u'range_',
    typeMetadata=FloatTypeMetadata(canBeNone=True, mustBeGreaterThan=0.),
    description=_(
u"""Initial value for the "range" parameter of the correlation
structure, which controls how quickly the correlation between
observations decays with distance. It must be greater than zero. If it
is omitted, the default initial value used by the R nlme package will
apply. This parameter is ignored if the Correlation Structure
parameter is omitted."""),
    arcGISDisplayName=_(u'Range'),
    arcGISCategory=_(u'Within-group correlation structure'))

AddArgumentMetadata(LinearMixedModel.FitToArcGISTable, u'nugget',
    typeMetadata=FloatTypeMetadata(canBeNone=True, minValue=0., maxValue=1.),
    description=_(
u"""Initial value for the "nugget" effect of the correlation
structure, a number between 0 and 1 representing the proportion of the
variance observed at zero distance. If it is omitted, no nugget effect
will be used. This parameter is ignored if the Correlation Structure
parameter is omitted."""),
    arcGISDisplayName=_(u'Nugget'),
    arcGISCategory=_(u'Within-group correlation structure'))

AddArgumentMetadata(LinearMixedModel.FitToArcGISTable, u'metric',
    typeMetadata=UnicodeStringTypeMetadata(canBeNone=True,
        makeLowercase=True, allowedValues=[u'Euclidean', u'Manhattan', u'Maximum']),
    description=_(
u"""Distance metric used to compute the distances between
observations, one of:
* Euclidean - the straight-line distance.
* Manhattan - the sum of the absolute differences of the coordinates.
* Maximum - the maximum absolute difference of the coordinates.
This parameter is ignored if the Correlation Structure parameter is
omitted."""),
    arcGISDisplayName=_(u'Distance metric'),
    arcGISCategory=_(u'Within-group correlation structure'))

AddArgumentMetadata(LinearMixedModel.FitToArcGISTable, u'fixed',
    typeMetadata=BooleanTypeMetadata(),
    description=_(
u"""If True, the coefficients of the correlation structure (the range
and nugget) will be kept fixed at their initial values rather than
being optimized as part of the model fitting. This parameter is
ignored if the Correlation Structure parameter is omitted."""),
    arcGISDisplayName=_(u'Keep coefficients fixed during model optimization'),
    arcGISCategory=_(u'Within-group correlation structure'))
# Remaining arguments for LinearMixedModel.FitToArcGISTable: coordinate
# columns, summary file, diagnostic plots, and plot-format options, mostly
# copied from the GLM and TreeModel fitting tools.
CopyArgumentMetadata(GLM.FitToArcGISTable, u'xColumnName',
LinearMixedModel.FitToArcGISTable, u'xColumnName')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'yColumnName',
LinearMixedModel.FitToArcGISTable, u'yColumnName')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'zColumnName',
LinearMixedModel.FitToArcGISTable, u'zColumnName')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'mColumnName',
LinearMixedModel.FitToArcGISTable, u'mColumnName')
CopyArgumentMetadata(TreeModel.FitToArcGISTable, u'writeSummaryFile',
LinearMixedModel.FitToArcGISTable, u'writeSummaryFile')
# NOTE(review): the description below still contains a TODO placeholder;
# the specific plots written cannot be determined from this metadata alone.
AddArgumentMetadata(LinearMixedModel.FitToArcGISTable,
u'writeDiagnosticPlots',
typeMetadata=BooleanTypeMetadata(),
description=_(
u"""If True, this tool will write diagnostic plots for the fitted
model.
TODO: Write documentation for this parameter."""),
arcGISDisplayName=_(u'Write diagnostic plots'),
arcGISCategory=_(u'Additional output options'))
CopyArgumentMetadata(GLM.FitToArcGISTable, u'plotFileFormat',
LinearMixedModel.FitToArcGISTable, u'plotFileFormat')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'res',
LinearMixedModel.FitToArcGISTable, u'res')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'width',
LinearMixedModel.FitToArcGISTable, u'width')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'height',
LinearMixedModel.FitToArcGISTable, u'height')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'pointSize',
LinearMixedModel.FitToArcGISTable, u'pointSize')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'bg',
LinearMixedModel.FitToArcGISTable, u'bg')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'overwriteExisting',
LinearMixedModel.FitToArcGISTable, u'overwriteExisting')
# Public method: LinearMixedModel.PredictFromArcGISRasters
#
# Fix: the shortDescription string literal had been split across three
# lines (line-wrapping corruption), making this statement a syntax error.
# Rejoined here; the metadata values are unchanged.
AddMethodMetadata(LinearMixedModel.PredictFromArcGISRasters,
    shortDescription=_(u'Using a fitted linear mixed model, this tool creates a raster representing the response variable predicted from rasters representing the predictor variables.'),
    isExposedToPythonCallers=True,
    isExposedByCOM=True,
    isExposedAsArcGISTool=True,
    arcGISDisplayName=_(u'Predict Linear Mixed Model From Rasters'),
    arcGISToolCategory=_(u'Statistics\\Model Data\\Linear Mixed Models'),
    dependencies=[ArcGISDependency(9, 1), RDependency(2, 7, 0),
                  RPackageDependency(u'rgdal')])
# Arguments for LinearMixedModel.PredictFromArcGISRasters: cls and input
# model file, then the raster-prediction arguments copied from GLM.
CopyArgumentMetadata(LinearMixedModel.FitToArcGISTable, u'cls',
LinearMixedModel.PredictFromArcGISRasters, u'cls')
AddArgumentMetadata(LinearMixedModel.PredictFromArcGISRasters,
u'inputModelFile',
typeMetadata=FileTypeMetadata(mustExist=True),
description=_(
u"""File that contains the fitted model, generated by the Fit Linear
Mixed Model tool."""),
arcGISDisplayName=_(u'Input model file'))
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'outputResponseRaster',
LinearMixedModel.PredictFromArcGISRasters, u'outputResponseRaster')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'templateRaster',
LinearMixedModel.PredictFromArcGISRasters, u'templateRaster')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'rasterPredictorNames',
LinearMixedModel.PredictFromArcGISRasters, u'rasterPredictorNames')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'predictorRasters',
LinearMixedModel.PredictFromArcGISRasters, u'predictorRasters')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'constantPredictorNames',
LinearMixedModel.PredictFromArcGISRasters, u'constantPredictorNames')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters,
u'constantPredictorValues', LinearMixedModel.PredictFromArcGISRasters,
u'constantPredictorValues')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'cutoff',
LinearMixedModel.PredictFromArcGISRasters, u'cutoff')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'resamplingTechniques',
LinearMixedModel.PredictFromArcGISRasters, u'resamplingTechniques')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'ignoreOutOfRangeValues',
LinearMixedModel.PredictFromArcGISRasters, u'ignoreOutOfRangeValues')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'buildPyramids',
LinearMixedModel.PredictFromArcGISRasters, u'buildPyramids')
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'overwriteExisting',
LinearMixedModel.PredictFromArcGISRasters, u'overwriteExisting')
###############################################################################
# Metadata: RandomForestModel class
###############################################################################

# Fix: the shortDescription string literal had been split across two lines
# (line-wrapping corruption), making this statement a syntax error.
# Rejoined here; the metadata values are unchanged.
AddClassMetadata(RandomForestModel,
    shortDescription=_(u'Provides methods for modeling and prediction using random forest models.'),
    isExposedAsCOMServer=True,
    comIID=u'{E37F4014-7B74-42A8-877E-6928DA6553B3}',
    comCLSID=u'{D709A81C-46FB-45FC-80D5-9B1EEA54A068}')
# Public method: RandomForestModel.FitToArcGISTable
# Registers the method's tool-level metadata (descriptions, exposure flags,
# ArcGIS toolbox placement, and runtime dependencies). Fixes two typos in the
# user-facing longDescription text ("is machine learning" -> "is a machine
# learning"; "advice their application" -> "advice on their application").
AddMethodMetadata(RandomForestModel.FitToArcGISTable,
shortDescription=_(u'Fits a random forest model to data in a table.'),
longDescription=_(
u"""Random forest (Breiman, 2001) is a machine learning algorithm that
fits many classification or regression tree (CART) models to random
subsets of the input data and uses the combined result (the forest)
for prediction. For a detailed description of random forests and
practical advice on their application in ecology, see Cutler et al.
(2007).
This tool fits a classification or regression forest using either the
R randomForest package (Liaw and Wiener, 2002) which implements
Breiman's classic algorithm, or the cforest function from the R party
package (Hothorn et al, 2006; Strobl et al, 2007; Strobl et al, 2008).
A principal feature of random forests is their ability to estimate the
importance of each predictor variable in modeling the response
variable. Strobl et al. (2007, 2008) found that the randomForest
package produces poor estimates in certain scenarios. The party
package provides a solution that uses conditional inference trees and
importance estimates, making it an attractive alternative to
randomForest. The party package does suffer from two drawbacks,
however: it does not produce the same diagnostic plots as
randomForest, and it requires more processing time and much more
memory than randomForest. If the input table has thousands of records,
the party package may simply not have enough memory to run.
**References**
Breiman, L. (2001). Random forests. Machine Learning, 45: 5-32.
Cutler, D.R., Edwards Jr., T.C., Beard, K.H., Cutler, A., Hess, K.T.,
Gibson, J., and Lawler, J.J. (2007). Random Forests for Classification
in Ecology. Ecology 88: 2783-2792.
Hothorn, T., Buehlmann, P., Dudoit, S., Molinaro, A., and Van Der
Laan, M. (2006). Survival Ensembles. Biostatistics 7: 355-373.
Liaw, A. and Wiener, M. (2002). Classification and Regression by randomForest.
`R News 2 <http://www.r-project.org/doc/Rnews/Rnews_2002-3.pdf>`_: 18-22.
Strobl, S., Boulesteix, A.-L., Kneib, T., Augustin, T., and Zeileis,
A. (2008). Conditional Variable Importance for Random Forests.
`BMC Bioinformatics 9:307 <http://www.biomedcentral.com/1471-2105/9/307>`_.
Strobl, S., Boulesteix, A.-L., Zeileis, A., and Hothorn, T. (2007).
Bias in Random Forest Variable Importance Measures: Illustrations,
Sources and a Solution. `BMC Bioinformatics 8:25
<http://www.biomedcentral.com/1471-2105/8/25>`_.
`R party package documentation
<http://cran.r-project.org/web/packages/party/party.pdf>`_
`R randomForest package documentation
<http://cran.r-project.org/web/packages/randomForest/randomForest.pdf>`_
"""),
isExposedToPythonCallers=True,
isExposedByCOM=True,
isExposedAsArcGISTool=True,
arcGISDisplayName=_(u'Fit Random Forest Model'),
arcGISToolCategory=_(u'Statistics\\Model Data\\Random Forest Models'),
dependencies=[ArcGISDependency(9, 1), RDependency(2, 9, 0)])
# Argument metadata for RandomForestModel.FitToArcGISTable.

# 'cls': the RandomForestModel class, or an instance of it, on which the
# method is invoked.
AddArgumentMetadata(RandomForestModel.FitToArcGISTable, u'cls',
typeMetadata=ClassOrClassInstanceTypeMetadata(cls=RandomForestModel),
description=_(u'%s class or an instance of it.') %
RandomForestModel.__name__)
# 'inputTable' and 'outputModelFile' have the same semantics as the
# corresponding GLM.FitToArcGISTable arguments; reuse that metadata.
CopyArgumentMetadata(GLM.FitToArcGISTable, u'inputTable',
RandomForestModel.FitToArcGISTable, u'inputTable')
CopyArgumentMetadata(GLM.FitToArcGISTable, u'outputModelFile',
RandomForestModel.FitToArcGISTable, u'outputModelFile')
# 'formula': R-style model formula selecting the response field and
# predictor fields; factor() on the response selects a classification
# forest rather than a regression forest.
AddArgumentMetadata(RandomForestModel.FitToArcGISTable, u'formula',
typeMetadata=UnicodeStringTypeMetadata(),
description=_(
u"""Formula that specifies the table fields that are the response
variable and predictor variables of the model.
To fit a regression forest, the formula must be of the form:
response ~ predictor1 + predictor2 + ... + predictorN
where response and predictor1 ... predictorN are fields of the table.
To fit a classification forest, the formula must be of the form:
factor(response) ~ predictor1 + predictor2 + ... + predictorN
The use of the R factor() function on the response variable designates
it as a categorical variable and causes a classification forest to be
built for it.
Above, "response" must be a field name. It may not be an R expression.
This prohibits certain shortcuts sometimes available in R, such as
fitting a binary classification using a response expression such as
factor(X > 10). To do that, add a new field, set it to the result of X
> 10, and then use the new field as the response variable.
The field names are case sensitive. If any field used in the formula
is NULL for a given row, that row will not be used in fitting the
model.
For example, if you have a field Presence that indicates the
categorical presence or absence of a species (1 or 0) and you want to
model it in terms of sampled environmental covariates stored in the
SST, ChlDensity, and Depth fields, you would use the formula::
factor(Presence) ~ SST + ChlDensity + Depth
By default, all predictors are treated as continuous variables. To indicate
that a predictor should be treated as a categorical variable, use the
factor function. For example, if SubstrateType is an integer code that
should be treated as categorical::
factor(Presence) ~ SST + ChlDensity + Depth + factor(SubstrateType)
Additional syntax may be possible depending on which R package is used
to fit the model. Please see the documentation for the R packages for
details."""),
arcGISDisplayName=_(u'Formula'))
# 'ntree': number of trees to grow (must be >= 1).
AddArgumentMetadata(RandomForestModel.FitToArcGISTable, u'ntree',
typeMetadata=IntegerTypeMetadata(minValue=1),
description=_(
u"""Number of trees to grow. In the random forests literature, this is
referred to as the ntree parameter.
Larger number of trees produce more stable models and covariate
importance estimates, but require more memory and a longer run time.
For small datasets, 50 trees may be sufficient. For larger datasets,
500 or more may be required. Please consult the random forests
literature for extensive discussion of this parameter (e.g. Cutler et
al., 2007; Strobl et al., 2007; Strobl et al., 2008)."""),
arcGISDisplayName=_(u'Number of trees to grow'))
# 'mtry': variables tried at each split; None means use the fitting
# package's own default.
AddArgumentMetadata(RandomForestModel.FitToArcGISTable, u'mtry',
typeMetadata=IntegerTypeMetadata(minValue=1, canBeNone=True),
description=_(
u"""Number of variables available for splitting at each tree node. In
the random forests literature, this is referred to as the mtry
parameter.
The default value of this parameter depends on which R package is used
to fit the model:
* randomForest - For classification models, the default is the square
root of the number of predictor variables (rounded down). For
regression models, it is the number of predictor variables divided
by 3 (rounded down).
* party - The default is always 5.
There is extensive discussion in the literature about the influence of
mtry. Cutler et al. (2007) reported that different values of mtry did
not affect the correct classification rates of their model and that
other performance metrics (sensitivity, specificity, kappa, and ROC
AUC) were stable under different values of mtry. On the other hand,
Strobl et al. (2008) reported that mtry had a strong influence on
predictor variable importance estimates.
Due to the conflicting evidence reported in the literature, we suggest
you start with the default value (i.e. leave this parameter blank) but
review the literature carefully and form your own opinion about what
value might be suitable for your specific model."""),
arcGISDisplayName=_(u'Number of variables available for splitting'))
# 'rPackage': which R implementation fits the forest; restricted to the two
# supported values 'party' and 'randomForest'.
AddArgumentMetadata(RandomForestModel.FitToArcGISTable, u'rPackage',
typeMetadata=UnicodeStringTypeMetadata(allowedValues=[u'party',
u'randomForest']),
description=_(
u"""R package to use when fitting the model.
* randomForest - The randomForest package (Liaw and Wiener, 2002)
which implements Breiman's classic algorithm. This is the default.
* party - The party package, from which the cforest function (Hothorn
et al, 2006; Strobl et al, 2007; Strobl et al, 2008) will be used to
fit the model. This package provides better results in certain
situations, particularly in estimating predictor variable
importance, at the cost of requiring more processing time and much
more memory than randomForest. If the input table has thousands of
records or more, the cforest function may simply not have enough
memory to run. Also, this package does not produce the diagnostic
plots that randomForest produces.
For more information on the two packages, please see the
`randomForest package documentation
<http://cran.r-project.org/web/packages/randomForest/randomForest.pdf>`_
and the
`party package documentation
<http://cran.r-project.org/web/packages/party/party.pdf>`_.
"""),
arcGISDisplayName=_(u'R package to use'))
# 'where': same row-selection semantics as GLM.FitToArcGISTable.
CopyArgumentMetadata(GLM.FitToArcGISTable, u'where',
RandomForestModel.FitToArcGISTable, u'where')
# 'replace': whether individual trees sample the data with replacement.
AddArgumentMetadata(RandomForestModel.FitToArcGISTable, u'replace',
typeMetadata=BooleanTypeMetadata(),
description=_(
u"""If True, individual trees of the forest are built using sampling
with replacement. If False, the default, the trees are built using
sampling without replacement.
In the random forests literature, this is referred to as the replace
parameter. Please see Strobl et al. (2007) for a discussion of its
effects. Although the classic randomForest package used a default
value of True for this parameter, we opted to use a default value of
False after reviewing the findings of Strobl et al."""),
arcGISDisplayName=_(u'Sample with replacement'),
arcGISCategory=_(u'Model options'))
# 'cfMaxSurrogate': party-package-only count of surrogate splits to
# evaluate; None defers to the party package default.
# Fix: the arcGISDisplayName string literal had been broken across two
# lines (a syntax error); it is rejoined into a single literal here.
AddArgumentMetadata(RandomForestModel.FitToArcGISTable, u'cfMaxSurrogate',
typeMetadata=IntegerTypeMetadata(minValue=0, canBeNone=True),
description=_(
u"""Number of surrogate splits to evaluate. This parameter is only
used when the R party package is used to fit the model.
The default value is determined by the party package itself. At the
time of this writing it was 0, which meant that surrogate splits were
not evaluated by default.
If surrogate splits are evaluated, the model may use them to estimate
values for predictor variables that are missing data. Please see the
party package documentation for more information; also, Hapfelmeier
(2012) may be useful.
References
Hapfelmeier, A. (2012). Random Forest variable importance with missing
data. Technical Report Number 121, 2012. Department of Statistics,
University of Munich, Germany."""),
arcGISDisplayName=_(u'Number of surrogate splits to evaluate (party package only)'),
arcGISCategory=_(u'Model options'))
# 'seed': optional seed for R's random number generator, for reproducible
# model fits. Fixes a typo in the user-facing text ("assuming do not chang"
# -> "assuming you do not change").
AddArgumentMetadata(RandomForestModel.FitToArcGISTable, u'seed',
typeMetadata=IntegerTypeMetadata(canBeNone=True),
description=_(
u"""Random number seed to use.
If a value is provided, it will be used to initialize R's random
number generator before the model is fitted. If a value is not
provided (the default), the random number generator will be
initialized from the current time.
This parameter is provided so you can precisely control the model
fitting process, if desired. Because random forests rely on random
selections of data, the default behavior is for a different forest to
be built every time you run the tool. To override this, you can
specify the seed for the random number generator. This will cause the
same exact sequence of random selections to be performed every time
you run the tool (assuming you do not change the input data or any other
parameters)."""),
arcGISDisplayName=_(u'Random number seed'),
arcGISCategory=_(u'Model options'))
# 'importance': whether to estimate and report predictor variable
# importance. Fixes typos in the user-facing text ("calcuation" ->
# "calculation", "descrease" -> "decrease", "estimating variables" ->
# "estimating variable importance").
AddArgumentMetadata(RandomForestModel.FitToArcGISTable, u'importance',
typeMetadata=BooleanTypeMetadata(),
description=_(
u"""If True, the default, the importance of each predictor variable
will be estimated and reported. If False, variable importance will not
be estimated.
If the R randomForest package is used to fit the model, two estimates
of importance will be reported, permutation accuracy and node
impurity, as described below. If the party package is used, only
permutation accuracy will be reported.
*Permutation Accuracy*
Permutation accuracy is the method that is most often recommended for
estimating variable importance in random forests. The basic idea is to
see how much worse the model performs when each predictor variable is
assigned random but realistic values and the rest of the variables are
left unchanged. The worse the model performs when a given predictor
variable is randomized, the more important that variable is in
predicting the response variable.
The estimate is computed as follows. First, for each tree in the
forest, the prediction error is calculated on the out-of-bag portion
of the data. Next, for each variable, the same calculation is performed
using a random permutation of the values of that variable. Finally,
for each variable, the differences in prediction errors are averaged
over all trees.
For classification trees, the result is the mean decrease in
prediction accuracy (i.e. the mean decrease in the percentage of
observations classified correctly), reported on a 0 to 1 scale (with 1
representing 100%). In the Variable Importance table output by the
tool, this is the MeanDecreaseAccuracy column. When the R randomForest
package is used, this estimate is also reported for each class; these
estimates precede the MeanDecreaseAccuracy column.
For regression trees, the result is the percentage increase in mean
squared errors, reported on a 0 to 100 scale (with 100 representing
100%). In the Variable Importance table output by the tool, this is
the %IncMSE column.
In both cases, higher values indicate more important variables.
*Node Impurity*
This is an alternative method for estimating variable importance in random
forests. It is only provided by the randomForest package. Strobl et
al. (2007) report that this method is biased in certain ways and
recommend against it. Please see that paper for a detailed analysis
and description.
From the randomForest package documentation: this method reports the
total decrease in node impurities from splitting on each variable,
averaged over all trees. For classification trees, the result is the
mean decrease in the Gini index, and reported in the MeanDecreaseGini
column. For regression trees, the result is measured by the residual
sum of squares, and reported in the IncNodePurity column."""),
arcGISDisplayName=_(u'Estimate predictor variable importance'),
arcGISCategory=_(u'Model options'))
# 'useScaledImportance': randomForest-package-only option to scale the
# permutation-accuracy importance estimates.
# Fixes: rejoins the arcGISDisplayName string literal that had been broken
# across two lines (a syntax error), and corrects the "used only when if"
# grammar error in the user-facing text.
AddArgumentMetadata(RandomForestModel.FitToArcGISTable,
u'useScaledImportance',
typeMetadata=BooleanTypeMetadata(),
description=_(
u"""This parameter is used only when the model is fitted with the R
randomForest package.
If True, the Permutation Accuracy estimates of predictor variable
importance are scaled by (divided by) the standard deviation of the
differences in prediction errors. If False, the default, the
Permutation Accuracy estimates are not scaled.
Strobl and Zeileis (2008) reported that a scaling approach can lead to
undesirable results. Following their advice, we recommend against
scaling.
References
Strobl, C. and Zeileis, A. (2008). Danger: High Power! - Exploring the
Statistical Properties of a Test for Random Forest Variable
Importance. Proceedings of the 18th International Conference on
Computational Statistics, Porto, Portugal."""),
arcGISDisplayName=_(u'Use scaled variable importance (randomForest package only)'),
arcGISCategory=_(u'Model options'))
# 'useConditionalImportance': party-package-only option to compute
# conditional permutation importance (Strobl et al., 2008).
# Fixes: rejoins the arcGISDisplayName string literal that had been broken
# across two lines (a syntax error), and corrects the "used only when if"
# grammar error in the user-facing text.
AddArgumentMetadata(RandomForestModel.FitToArcGISTable,
u'useConditionalImportance',
typeMetadata=BooleanTypeMetadata(),
description=_(
u"""This parameter is used only when the model is fitted with the R
party package.
If True, conditional variable importance is computed. If False, the
default, the traditional permutation accuracy is computed, as is done
by the randomForest package.
A problem with the traditional permutation accuracy method is that it
does not account for autocorrelations between predictor variables.
When predictor variables are autocorrelated, the traditional method
exhibits a bias toward these variables and inflates their estimated
importances. A principal feature of the R party package is the ability
to estimate conditional variable importance and thereby reduce or
eliminate this bias. This method is described in detail by Strobl et
al. (2008).
We chose not to enable the conditional method by default because we
found it requires quite a lot of processing time and memory for large
datasets. We suggest you try first without using the conditional
method, then again with it enabled."""),
arcGISDisplayName=_(u'Use conditional variable importance (party package only)'),
arcGISCategory=_(u'Model options'))
# The coordinate-field arguments are identical to those of
# GLM.FitToArcGISTable, and writeSummaryFile is identical to that of
# TreeModel.FitToArcGISTable; reuse the existing metadata.
CopyArgumentMetadata(GLM.FitToArcGISTable, u'xColumnName',
    RandomForestModel.FitToArcGISTable, u'xColumnName')

CopyArgumentMetadata(GLM.FitToArcGISTable, u'yColumnName',
    RandomForestModel.FitToArcGISTable, u'yColumnName')

CopyArgumentMetadata(GLM.FitToArcGISTable, u'zColumnName',
    RandomForestModel.FitToArcGISTable, u'zColumnName')

CopyArgumentMetadata(GLM.FitToArcGISTable, u'mColumnName',
    RandomForestModel.FitToArcGISTable, u'mColumnName')

CopyArgumentMetadata(TreeModel.FitToArcGISTable, u'writeSummaryFile',
    RandomForestModel.FitToArcGISTable, u'writeSummaryFile')
# 'writeImportancePlot': randomForest-package-only graphical importance
# output. Fixes the "used only when if" grammar error in the user-facing
# text.
AddArgumentMetadata(RandomForestModel.FitToArcGISTable,
u'writeImportancePlot',
typeMetadata=BooleanTypeMetadata(),
description=_(
u"""This parameter is used only when the model is fitted with the R
randomForest package.
If True, this tool will write a plot of the importance of predictor
variables estimated by permutation accuracy to a file having the name
X_importance.Y, where X is the name of the output model file minus the
extension and Y is the extension of the selected output plot format.
If the model is a classification model, the tool will also write a
plot of predictor variable importance for each class to a file having
the name X_importance_class_C.Y, where C is the name of the class.
These plots are just graphical depictions of the predictor importance
values reported in the tool's output. They do not contain any
information not already present in the tool's output.
At the time this tool was implemented, the R party package did not
provide a function for producing these plots. In the future, we may
adapt the code from the randomForest package to run on models fitted
with the party package."""),
arcGISDisplayName=_(u'Write predictor variable importance plot'),
arcGISCategory=_(u'Additional output options'))
# 'writePartialDependencePlots': randomForest-package-only per-term
# marginal-effect plots; disabled by default because they are slow to
# create. Fixes the "used only when if" grammar error in the user-facing
# text.
AddArgumentMetadata(RandomForestModel.FitToArcGISTable,
u'writePartialDependencePlots',
typeMetadata=BooleanTypeMetadata(),
description=_(
u"""This parameter is used only when the model is fitted with the R
randomForest package. This option is disabled by default because
creating these plots can take a long time.
If True, this tool will write a partial dependence plot for each term
in the fitted model's formula. Each plot gives a graphical depiction
of the marginal effect of a predictor variable on the class
probability (for classification models) or response (for regression
models). For classification models, a plot is generated for each class
and term combination.
Please see Cutler et al. (2007) for a detailed discussion of partial
dependence plots.
At the time this tool was implemented, the R party package did not
provide a function for producing these plots. In the future, we may
adapt the code from the randomForest package to run on models fitted
with the party package."""),
arcGISDisplayName=_(u'Write partial dependence plots'),
arcGISCategory=_(u'Additional output options'))
# The plot-rendering arguments and overwrite flag are identical to those of
# GLM.FitToArcGISTable; reuse the existing metadata.
CopyArgumentMetadata(GLM.FitToArcGISTable, u'plotFileFormat',
    RandomForestModel.FitToArcGISTable, u'plotFileFormat')

CopyArgumentMetadata(GLM.FitToArcGISTable, u'res',
    RandomForestModel.FitToArcGISTable, u'res')

CopyArgumentMetadata(GLM.FitToArcGISTable, u'width',
    RandomForestModel.FitToArcGISTable, u'width')

CopyArgumentMetadata(GLM.FitToArcGISTable, u'height',
    RandomForestModel.FitToArcGISTable, u'height')

CopyArgumentMetadata(GLM.FitToArcGISTable, u'pointSize',
    RandomForestModel.FitToArcGISTable, u'pointSize')

CopyArgumentMetadata(GLM.FitToArcGISTable, u'bg',
    RandomForestModel.FitToArcGISTable, u'bg')

CopyArgumentMetadata(GLM.FitToArcGISTable, u'overwriteExisting',
    RandomForestModel.FitToArcGISTable, u'overwriteExisting')
# Public method: RandomForestModel.PredictFromArcGISTable
# Registers the table-prediction method's tool metadata and dependencies.
# Fix: the shortDescription string literal had been broken across two lines
# (a syntax error); it is rejoined into a single literal here.
AddMethodMetadata(RandomForestModel.PredictFromArcGISTable,
shortDescription=_(u'Given a random forest model, this tool predicts the response variable for each row of a table.'),
longDescription=_(
u"""If a table is not provided, the prediction will be done on the
training data used to fit the model.
On completion, the tool outputs statistics that summarize how well the
model's predictions match the observed values of the response
variable, unless a table is provided that does not contain the
observed values of the response variable."""),
isExposedToPythonCallers=True,
isExposedByCOM=True,
isExposedAsArcGISTool=True,
arcGISDisplayName=_(u'Predict Random Forest From Table'),
arcGISToolCategory=_(u'Statistics\\Model Data\\Random Forest Models'),
dependencies=[ArcGISDependency(9, 2), RDependency(2, 9, 0),
RPackageDependency(u'ROCR'), RPackageDependency(u'e1071'),
RPackageDependency(u'caret')]) # e1071 is required by caret
# 'cls' is identical to the fitting method's 'cls' argument; reuse it.
CopyArgumentMetadata(RandomForestModel.FitToArcGISTable, u'cls',
RandomForestModel.PredictFromArcGISTable, u'cls')
# 'inputModelFile': model file produced by the Fit Random Forest Model
# tool; it must already exist.
AddArgumentMetadata(RandomForestModel.PredictFromArcGISTable,
u'inputModelFile',
typeMetadata=FileTypeMetadata(mustExist=True),
description=_(
u"""File that contains the fitted model, generated by the Fit Random
Forest Model tool."""),
arcGISDisplayName=_(u'Fitted model file'))
# The table and output-field arguments mirror GLM.PredictFromArcGISTable.
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'inputTable',
RandomForestModel.PredictFromArcGISTable, u'inputTable')
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'predictedValuesField',
RandomForestModel.PredictFromArcGISTable, u'predictedValuesField')
# 'cutoff': threshold for converting binary-classification probabilities
# into 0/1 predictions; None selects the Youden-index-maximizing value.
# Fixes a typo in the user-facing text ("discusson" -> "discussion").
AddArgumentMetadata(RandomForestModel.PredictFromArcGISTable, u'cutoff',
typeMetadata=FloatTypeMetadata(minValue=0., maxValue=1., canBeNone=True),
description=_(
u"""Cutoff to use when classifying the continuous probability output
by a binary classification forest into an integer result (0 or 1).
Probabilities greater than or equal to the cutoff are classified as 1;
probabilities less than the cutoff are classified as 0.
This parameter should not be used for regression forests or
classification forests with more than two classes.
For binary classification forests, if a value is not provided, the
tool will automatically select the value that maximizes the value of
the Youden index (see Perkins and Schisterman, 2006), thereby
attempting to minimize the misclassification rate of the model. This
approach may not be optimal for your application; we encourage you to
review the extensive discussion of cutoffs in the scientific literature
and select a value deliberately.
References
Perkins NJ, Schisterman EF (2006) The Inconsistency of "Optimal"
Cutpoints Obtained using Two Criteria based on the Receiver Operating
Characteristic Curve. American Journal of Epidemiology 163:
670-675."""),
arcGISDisplayName=_(u'Binary classification cutoff'))
# The remaining arguments and both results of this method are identical to
# those of GLM.PredictFromArcGISTable; reuse the existing metadata.
CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'where',
    RandomForestModel.PredictFromArcGISTable, u'where')

CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'ignoreOutOfRangeValues',
    RandomForestModel.PredictFromArcGISTable, u'ignoreOutOfRangeValues')

CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'noDataValue',
    RandomForestModel.PredictFromArcGISTable, u'noDataValue')

CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'outputPlotFile',
    RandomForestModel.PredictFromArcGISTable, u'outputPlotFile')

CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'measure1',
    RandomForestModel.PredictFromArcGISTable, u'measure1')

CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'measure2',
    RandomForestModel.PredictFromArcGISTable, u'measure2')

CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'colorize',
    RandomForestModel.PredictFromArcGISTable, u'colorize')

CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'outputSummaryFile',
    RandomForestModel.PredictFromArcGISTable, u'outputSummaryFile')

CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'res',
    RandomForestModel.PredictFromArcGISTable, u'res')

CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'width',
    RandomForestModel.PredictFromArcGISTable, u'width')

CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'height',
    RandomForestModel.PredictFromArcGISTable, u'height')

CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'pointSize',
    RandomForestModel.PredictFromArcGISTable, u'pointSize')

CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'bg',
    RandomForestModel.PredictFromArcGISTable, u'bg')

CopyArgumentMetadata(GLM.PredictFromArcGISTable, u'overwriteExisting',
    RandomForestModel.PredictFromArcGISTable, u'overwriteExisting')

CopyResultMetadata(GLM.PredictFromArcGISTable, u'updatedTable',
    RandomForestModel.PredictFromArcGISTable, u'updatedTable')

CopyResultMetadata(GLM.PredictFromArcGISTable, u'outputCutoff',
    RandomForestModel.PredictFromArcGISTable, u'outputCutoff')
# Public method: RandomForestModel.PredictFromArcGISRasters
# Registers the raster-prediction method's tool metadata and dependencies.
# Fix: the shortDescription string literal had been broken across three
# lines (a syntax error); it is rejoined into a single literal here.
AddMethodMetadata(RandomForestModel.PredictFromArcGISRasters,
shortDescription=_(u'Using a fitted random forest model, this tool creates a raster representing the response variable predicted from rasters representing the predictor variables.'),
isExposedToPythonCallers=True,
isExposedByCOM=True,
isExposedAsArcGISTool=True,
arcGISDisplayName=_(u'Predict Random Forest From Rasters'),
arcGISToolCategory=_(u'Statistics\\Model Data\\Random Forest Models'),
dependencies=[ArcGISDependency(9, 2), RDependency(2, 9, 0),
RPackageDependency(u'rgdal')])
# 'cls' and 'inputModelFile' reuse the metadata declared for the fitting
# and table-prediction methods, respectively.
CopyArgumentMetadata(RandomForestModel.FitToArcGISTable, u'cls',
RandomForestModel.PredictFromArcGISRasters, u'cls')
CopyArgumentMetadata(RandomForestModel.PredictFromArcGISTable,
u'inputModelFile', RandomForestModel.PredictFromArcGISRasters,
u'inputModelFile')
# 'outputResponseRaster': destination raster for the predicted response;
# deleted first when overwriteExisting is True, and parent directories are
# created as needed.
AddArgumentMetadata(RandomForestModel.PredictFromArcGISRasters,
u'outputResponseRaster',
typeMetadata=ArcGISRasterTypeMetadata(mustBeDifferentThanArguments=[u'inputModelFile'],
deleteIfParameterIsTrue=u'overwriteExisting', createParentDirectories=True),
description=_(
u"""Output raster representing the predicted response.
For a classification model, this will be an integer raster with values
representing predicted classes. For a regression model, it will be a
floating-point raster containing predicted regression values.
The output raster will have the coordinate system, extent, and cell
size of the Template Raster. If a Template Raster is not specified,
the first Predictor Raster will be used as the template instead. The
prediction is performed for each cell of the template by extracting
the predictor values for that cell and processing them through the
fitted model.
Each predictor can either be obtained from a raster that gives the
values of it or assigned a constant value that is the same for all
cells. Accordingly, all predictors in the model must be listed under
the Raster Predictor Variables parameter of this tool or the Constant
Predictor Variables parameter (but not both).
For example, if your model used the formula::
Response ~ SST + Depth + DayOfYear
In this model, SST and Depth both vary spatially. These should be
listed under Raster Predictor Variables, and corresponding temperature
and depth rasters should be listed under Predictor Rasters.
On the other hand, DayOfYear does not vary spatially. Therefore it
should be listed under Constant Predictor Variables, and a value
should be given under Constant Values."""),
direction=u'Output',
arcGISDisplayName=_(u'Output response raster'))
# The remaining raster-prediction arguments are identical to those of
# GLM.PredictFromArcGISRasters; reuse the existing metadata.
CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'templateRaster',
    RandomForestModel.PredictFromArcGISRasters, u'templateRaster')

CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'rasterPredictorNames',
    RandomForestModel.PredictFromArcGISRasters, u'rasterPredictorNames')

CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'predictorRasters',
    RandomForestModel.PredictFromArcGISRasters, u'predictorRasters')

CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'constantPredictorNames',
    RandomForestModel.PredictFromArcGISRasters, u'constantPredictorNames')

CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'constantPredictorValues',
    RandomForestModel.PredictFromArcGISRasters, u'constantPredictorValues')

CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'cutoff',
    RandomForestModel.PredictFromArcGISRasters, u'cutoff')

CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'resamplingTechniques',
    RandomForestModel.PredictFromArcGISRasters, u'resamplingTechniques')

CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'ignoreOutOfRangeValues',
    RandomForestModel.PredictFromArcGISRasters, u'ignoreOutOfRangeValues')

CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'buildPyramids',
    RandomForestModel.PredictFromArcGISRasters, u'buildPyramids')

CopyArgumentMetadata(GLM.PredictFromArcGISRasters, u'overwriteExisting',
    RandomForestModel.PredictFromArcGISRasters, u'overwriteExisting')
###############################################################################
# Metadata: ModelEvaluation class
###############################################################################

# Register class-level metadata: short description plus the fixed COM
# interface and class GUIDs under which the class is exposed as a COM server.
AddClassMetadata(ModelEvaluation,
    shortDescription=_(u'Provides methods for evaluating fitted models.'),
    isExposedAsCOMServer=True,
    comIID=u'{D5DC8EA1-B51A-4B0F-A43D-F91C516CD8A1}',
    comCLSID=u'{DD4042FC-A356-480E-9211-62B51C2113A7}')
# Public method: ModelEvaluation.PlotPerformanceOfBinaryClassificationModel
# Registers the method's metadata; not currently exposed as an ArcGIS tool
# (the arcGISDisplayName/arcGISToolCategory lines are deliberately
# commented out).
# Fixes: rejoins the shortDescription string literal that had been broken
# across three lines (a syntax error), and rejoins the commented-out
# arcGISToolCategory line whose continuation had lost its leading '#'
# (also a syntax error).
AddMethodMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel,
shortDescription=_(u'Plots the performance of a binary classification model (a model where the response variable has two possible values) using the R ROCR package.'),
isExposedToPythonCallers=True,
isExposedByCOM=True,
isExposedAsArcGISTool=False,
#arcGISDisplayName=_(u'Plot Performance of Binary Classification Model'),
#arcGISToolCategory=_(u'Statistics\\Model Data\\Evaluate Model Performance'),
dependencies=[RDependency(2, 6, 0), RPackageDependency(u'ROCR')])
AddArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel,
u'cls',
typeMetadata=ClassOrClassInstanceTypeMetadata(cls=ModelEvaluation),
description=_(u'%s class or an instance of it.') %
ModelEvaluation.__name__)
# Fixed typos in the user-visible description: "receve" -> "receive",
# "satisfied from" -> "satisfied by".
AddArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'inputModelFile',
    typeMetadata=FileTypeMetadata(mustExist=True),
    description=_(
u"""File that contains the fitted model, generated by one of the
fitting tools (e.g. Fit GLM).

The model must conform to two constraints:

* It must be a binary classification model, i.e. there must be only
  two possible values of the response variable in the data used to
  build the model.

* The larger response value must represent the "positive" result, and
  the smaller the "negative" result.

These constraints would be satisfied, for example, by a species
presence/absence model where the response variable was either 1
(species present) or 0 (species not present). It would not be
satisfied by a species density model where the response variable was
the number of species per unit area. Nor would it be satisfied if the
response variable values were switched, such that 1 represented
absence and 0 represented presence.

If the first constraint is violated, you will receive the error::

    ROCR currently supports only evaluation of binary classification tasks.

If the second constraint is violated, you may not receive an error.
Instead, the output plot and statistics will be wrong. Take care to
avoid violating this constraint."""),
    arcGISDisplayName=_(u'Input model file'))
# Fixed typos in the user-visible description: "this this plot" -> "this
# plot", "used for for empirical" -> "used for empirical", "combinining" ->
# "combining". The measure list mirrors the R ROCR performance() measures.
AddArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'measure1',
    typeMetadata=UnicodeStringTypeMetadata(allowedValues=[u'acc', u'cal', u'chisq', u'ecost', u'err', u'f', u'fall', u'fnr', u'fpr', u'lift', u'mat', u'mi', u'miss', u'npv', u'odds', u'ppv', u'pcfall', u'pcmiss', u'phi', u'prec', u'rch', u'rec', u'rnp', u'rpp', u'sar', u'sens', u'spec', u'tnr', u'tpr']),
    description=_(
u"""The first performance measure to plot.

This measure will serve as the Y coordinate for the plot. If a second
measure is specified, it will serve as the X coordinate. Otherwise the
model cutoff will serve as the X coordinate.

When modeling a binary response, an important task is selecting a
cutoff. The cutoff is the value used to determine whether a given
predicted value of the response variable should be classified as
positive or negative. Because the model typically outputs predictions
along a continual range (e.g. between 0.0 and 1.0), you must determine
the portions of the range that represent positive and negative
responses. Response values less than the cutoff are classified as
negative and those greater than or equal to the cutoff are classified
as positive.

Plots that use the cutoff as the X coordinate show you a measure of
the model's performance for the range of cutoff values. You can use
this plot to select the cutoff value that maximizes, minimizes,
or otherwise optimizes a given performance measure. For example, if
you selected "acc" (accuracy) as the performance measure, you could
find the cutoff value that maximized the model's accuracy by finding
the highest point on the plot and then looking up the cutoff value on
the X axis.

Plots that use a second performance measure as the X coordinate allow
you to balance one measure of the model's performance against another.
The plot will be color-coded by cutoff value, allowing you to look up
the cutoff for a given combination of the two performance measures.
Selecting an optimal cutoff will often involve making a tradeoff
between the two measures.

For example, to create a classic receiver operating characteristic
(ROC) curve, select "tpr" (true positive rate) for the first measure
and "fpr" (false positive rate) for the second measure.

For the first performance measure, you can select from the following
list. This list is taken from the documentation of the R ROCR package
that implements the plots. Some of them do not allow selection of a
second performance measure (you must omit it when invoking this tool),
as noted in the measure's description.

In these descriptions, Y and Yhat are random variables representing
the class and the prediction for a randomly drawn sample,
respectively. + and - are the positive and negative class,
respectively. The following abbreviations are used for empirical
quantities: P (# positive samples), N (# negative samples), TP (# true
positives), TN (# true negatives), FP (# false positives), FN (# false
negatives).

* acc - Accuracy. P(Yhat = Y). Estimated as: (TP+TN)/(P+N).

* cal - Calibration error. The calibration error is the absolute
  difference between predicted confidence and actual reliability. This
  error is estimated at all cutoffs by sliding a window of size 100
  across the range of possible cutoffs. E.g., if for several positive
  samples the output of the classifier is around 0.75, you might
  expect from a well-calibrated classifier that the fraction of them
  which is correctly predicted as positive is also around 0.75. In a
  well-calibrated classifier, the probabilistic confidence estimates
  are realistic. Only for use with probabilistic output (i.e. scores
  between 0 and 1; some of the other measures actually support values
  between -1 and 1).

* chisq - Chi square test statistic. Note that R might raise a warning
  if the sample size is too small.

* ecost - Expected cost. For details on cost curves, cf. Drummond &
  Holte 2000, 2004. ecost has an obligatory x axis, the so-called
  'probability-cost function'; thus you may not specify a second
  performance measure.

* err - Error rate. P(Yhat != Y). Estimated as: (FP+FN)/(P+N).

* f - Precision-recall F measure (van Rijsbergen, 1979). Weighted
  harmonic mean of precision (P) and recall (R). F = 1/ (alpha*1/P +
  (1-alpha)*1/R). For this tool, alpha is always 1/2, so the mean is
  balanced.

* fall - Fallout. Same as fpr.

* fnr - False negative rate. P(Yhat = - | Y = +). Estimated as: FN/P.

* fpr - False positive rate. P(Yhat = + | Y = -). Estimated as: FP/N.

* lift - Lift value. P(Yhat = + | Y = +)/P(Yhat = +).

* mat - Matthews correlation coefficient. Same as phi.

* mi - Mutual information. I(Yhat, Y) := H(Y) - H(Y | Yhat), where H
  is the (conditional) entropy. Entropies are estimated naively (no
  bias correction).

* miss - Miss. Same as fnr.

* npv - Negative predictive value. P(Y = - | Yhat = -). Estimated as:
  TN/(TN+FN).

* odds - Odds ratio. (TP*TN)/(FN*FP). Note that odds ratio produces
  Inf or NA values for all cutoffs corresponding to FN=0 or FP=0. This
  can substantially decrease the plotted cutoff region.

* pcfall - Prediction-conditioned fallout. P(Y = - | Yhat = +).
  Estimated as: FP/(TP+FP).

* pcmiss - Prediction-conditioned miss. P(Y = + | Yhat = -). Estimated
  as: FN/(TN+FN).

* phi - Phi correlation coefficient. (TP*TN -
  FP*FN)/(sqrt((TP+FN)*(TN+FP)*(TP+FP)*(TN+FN))). Yields a number
  between -1 and 1, with 1 indicating a perfect prediction, 0 indicating
  a random prediction. Values below 0 indicate a worse than random
  prediction.

* ppv - Positive predictive value. P(Y = + | Yhat = +). Estimated as:
  TP/(TP+FP).

* prec - Precision. Same as ppv.

* rch - ROC convex hull. A ROC (=tpr vs fpr) curve with concavities
  (which represent suboptimal choices of cutoff) removed (Fawcett
  2001). Since the result is already a parametric performance curve,
  it cannot be used in combination with other measures (thus you may
  not specify a second performance measure).

* rec - Recall. Same as tpr.

* rnp - Rate of negative predictions. P(Yhat = -). Estimated as:
  (TN+FN)/(TP+FP+TN+FN).

* rpp - Rate of positive predictions. P(Yhat = +). Estimated as:
  (TP+FP)/(TP+FP+TN+FN).

* sar - Score combining performance measures of different
  characteristics, in the attempt of creating a more "robust" measure
  (cf. Caruana R., ROCAI2004): SAR = 1/3 * (Accuracy + Area under the
  ROC curve + Root mean-squared error).

* sens - Sensitivity. Same as tpr.

* spec - Specificity. Same as tnr.

* tnr - True negative rate. P(Yhat = - | Y = -). Estimated as: TN/N.

* tpr - True positive rate. P(Yhat = + | Y = +). Estimated as: TP/P.
"""),
    arcGISDisplayName=_(u'First performance measure to plot'))
# Second measure: same ROCR measure list minus 'ecost' and 'rch', which have
# an obligatory X axis and therefore cannot serve as a second measure.
AddArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'measure2',
    typeMetadata=UnicodeStringTypeMetadata(canBeNone=True, allowedValues=[u'acc', u'cal', u'chisq', u'err', u'f', u'fall', u'fnr', u'fpr', u'lift', u'mat', u'mi', u'miss', u'npv', u'odds', u'ppv', u'pcfall', u'pcmiss', u'phi', u'prec', u'rec', u'rnp', u'rpp', u'sar', u'sens', u'spec', u'tnr', u'tpr']),
    description=_(
u"""The second performance measure to plot.

If specified, this performance measure will be used as the X
coordinate of the plot instead of the model cutoff value. Please see
the documentation for the First Performance Measure for more
information."""),
    arcGISDisplayName=_(u'Second performance measure to plot'))
# Fixed typo in the user-visible description: "will score are randomly drawn"
# -> "will score a randomly drawn".
AddArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'summaryStats',
    typeMetadata=ListTypeMetadata(elementType=UnicodeStringTypeMetadata(allowedValues=[u'auc', u'mxe', u'prbe', u'rmse']), canBeNone=True),
    description=_(
u"""List of summary statistics to calculate for the model.

The summary statistics are logged as information messages. If you
invoke this tool from ArcGIS, you will see them in the geoprocessing
output window. If you invoke this tool programmatically, the summary
statistics are returned in a list as well.

The available statistics are:

* auc - Area under the ROC curve. This is equal to the value of the
  Wilcoxon-Mann-Whitney test statistic and also the probability that
  the classifier will score a randomly drawn positive sample higher
  than a randomly drawn negative sample.

* mxe - Mean cross-entropy. Only for use with probabilistic response
  variables (i.e. scores between 0 and 1). MXE := - 1/(P+N)
  sum_{y_i=+} ln(yhat_i) + sum_{y_i=-} ln(1-yhat_i).

* prbe - Precision-recall break-even point. The cutoff where
  precision and recall are equal. At this point, positive and negative
  predictions are made at the same rate as their prevalence in the
  data. In the event that there are multiple break-even points, the
  cutoff for the last one will be returned.

* rmse - Root-mean-squared error. Only for use with numerical class
  labels. RMSE := sqrt(1/(P+N) sum_i (y_i - yhat_i)^2).

The summary statistics are calculated by the R ROCR package."""),
    arcGISDisplayName=_(u'Summary statistics to calculate'))
AddArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'title',
    typeMetadata=UnicodeStringTypeMetadata(canBeNone=True),
    description=_(
u"""Title for the plot. If no title is provided, the plot will not
have a title."""),
    arcGISDisplayName=_(u'Plot title'))
# Fixed typo in the user-visible description: "and and an evaluation" ->
# "and an evaluation".
AddArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'evaluationDataTable',
    typeMetadata=ArcGISTableViewTypeMetadata(canBeNone=True, mustExist=True),
    description=_(
u"""ArcGIS table, table view, feature class, or feature layer
containing the data for evaluating the model. If an evaluation data
table is not provided, the model will be evaluated using the data that
was used to fit the model. (A copy of this data exists in the input
model file.)

The evaluation data table must have the same fields as the table that
was used to fit the model. The model will be evaluated by reading
those values and computing the predicted response for each row and
comparing it to the actual response.

When doing statistical modeling, it is considered good practice to
split your input data into a training data set and an evaluation
data set, fit the model using the training data, and evaluate the
model using the evaluation data. Please consult the scientific
literature for advice on this procedure. The following article is a
good place to start.

Guisan, A., and Zimmerman, N.E. 2000. Predictive habitat distribution
models in ecology. Ecological Modeling 135: 147-186."""),
    arcGISDisplayName=_(u'Evaluation data table'),
    arcGISCategory=_(u'Evaluation data options'),
    dependencies=[ArcGISDependency(9, 1)])
AddArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'where',
    typeMetadata=SQLWhereClauseTypeMetadata(canBeNone=True),
    description=ArcGIS91SelectCursor.__init__.__doc__.Obj.GetArgumentByName(u'where').Description,
    arcGISParameterDependencies=[u'evaluationDataTable'],
    arcGISDisplayName=_(u'Where clause'),
    arcGISCategory=_(u'Evaluation data options'),
    dependencies=[ArcGISDependency(9, 1)])

AddArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'outputPlotFile',
    typeMetadata=FileTypeMetadata(canBeNone=True, mustBeDifferentThanArguments=[u'inputModelFile'], deleteIfParameterIsTrue=u'overwriteExisting', createParentDirectories=True),
    description=_(
u"""File to create for the plot. If this parameter is specified, the
plot will be written to the file rather than displayed on the screen.

The file must have one of the following two extensions, which
determines the format that will be used:

* .emf - Windows enhanced metafile (EMF) format. This is a vector
  format that may be printed and resized without any pixelation and is
  therefore suitable for use in printable documents that recognize
  this format (e.g. Microsoft Word or Microsoft Visio).

* .png - Portable network graphics (PNG) format. This is a compressed,
  lossless, highly portable raster format suitable for use in web
  pages or other locations where a raster format is desired. Most
  scientific journals accept PNG; they typically request that files
  have a resolution of at least 1000 DPI.
"""),
    direction=u'Output',
    arcGISDisplayName=_(u'Output file'),
    arcGISCategory=_(u'Output file options'))

AddArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'res',
    typeMetadata=FloatTypeMetadata(mustBeGreaterThan=0.),
    description=_(
u"""PNG plot file resolution, in dots per inch (DPI). The default is
set to a high value (1000) because this is the minimum resolution
typically required by scientific journals that accept figures in PNG
format.

This parameter is ignored for EMF format because it is a vector
format."""),
    arcGISDisplayName=_(u'Plot resolution, in DPI'),
    arcGISCategory=_(u'Output file options'))

AddArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'width',
    typeMetadata=FloatTypeMetadata(mustBeGreaterThan=0.),
    description=_(
u"""Plot file width in thousandths of inches (for EMF format; e.g. the
value 3000 is 3 inches) or pixels (for PNG format)."""),
    arcGISDisplayName=_(u'Plot width'),
    arcGISCategory=_(u'Output file options'))

AddArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'height',
    typeMetadata=FloatTypeMetadata(mustBeGreaterThan=0.),
    description=_(
u"""Plot file height in thousandths of inches (for EMF format; e.g. the
value 3000 is 3 inches) or pixels (for PNG format)."""),
    arcGISDisplayName=_(u'Plot height'),
    arcGISCategory=_(u'Output file options'))

AddArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'pointSize',
    typeMetadata=FloatTypeMetadata(minValue=1.0),
    description=_(
u"""The default pointsize of plotted text."""),
    arcGISDisplayName=_(u'Default pointsize of plotted text'),
    arcGISCategory=_(u'Output file options'))

AddArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'bg',
    typeMetadata=UnicodeStringTypeMetadata(),
    description=_(
u"""PNG plot file background color. The color must be a valid name in
R's color palette, or "transparent" if there is no background color.

This parameter is ignored if the plot format file is EMF."""),
    arcGISDisplayName=_(u'Plot background color'),
    arcGISCategory=_(u'Output file options'))

AddArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'overwriteExisting',
    typeMetadata=BooleanTypeMetadata(),
    description=_(
u"""If True, the output file will be overwritten, if it exists. If
False, a ValueError will be raised if the output file exists."""),
    initializeToArcGISGeoprocessorVariable=u'OverwriteOutput')

AddResultMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'summaryStatValues',
    typeMetadata=ListTypeMetadata(elementType=FloatTypeMetadata(), canBeNone=True),
    description=_(
u"""The calculated summary statistics. This will be the same length as
the summaryStats parameter. If summaryStats was None, this will be
None."""))
# Public method: ModelEvaluation.PlotROCOfBinaryClassificationModel

AddMethodMetadata(ModelEvaluation.PlotROCOfBinaryClassificationModel,
    shortDescription=_(u'Plots the receiver operating characteristic (ROC) curve of a binary classification model (a model where the response variable has two possible values) using the R ROCR package.'),
    isExposedToPythonCallers=True,
    isExposedByCOM=True,
    isExposedAsArcGISTool=False,
    #arcGISDisplayName=_(u'Plot ROC of Binary Classification Model'),
    #arcGISToolCategory=_(u'Statistics\\Model Data\\Evaluate Model Performance'),
    dependencies=[RDependency(2, 6, 0), RPackageDependency(u'ROCR')])

CopyArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'cls', ModelEvaluation.PlotROCOfBinaryClassificationModel, u'cls')
CopyArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'inputModelFile', ModelEvaluation.PlotROCOfBinaryClassificationModel, u'inputModelFile')

AddArgumentMetadata(ModelEvaluation.PlotROCOfBinaryClassificationModel, u'cutoff',
    typeMetadata=UnicodeStringTypeMetadata(allowedValues=[u'None', u'Automatic', u'Specified'], makeLowercase=True),
    description=_(
u"""Cutoff to display in the plot and to use for the calculation of
summary statistics. The possible values are:

* None - the plot will not display a cutoff and summary statistics
  will not be calculated.

* Automatic - the tool will automatically select the cutoff value for
  the point on the curve that is closest to the point of perfect
  classification (the upper-left corner of the plot).

* Specified - the tool will use the cutoff value that you specify for
  the Cutoff Value parameter.

When doing binary classification, an important task is selecting a
cutoff. The cutoff is the value used to determine whether a given
predicted value of the response variable should be classified as
positive or negative. Because the model typically outputs predictions
along a continual range (e.g. between 0.0 and 1.0), you must determine
the portions of the range that represent positive and negative
responses. Response values less than the cutoff are classified as
negative and those greater than or equal to the cutoff are classified
as positive.

In ROC analysis, you choose the cutoff value by contemplating the
tradeoff you will obtain between the true positive rate and false
positive rate of the model. A perfect model has a true positive rate
of 1 and a false positive rate of 0. This corresponds to the
upper-left corner of the ROC plot. Depending on your scenario, one
rate may be more important than another. In this case, you should run
this tool once with Cutoff set to Automatic, inspect the plot, choose
a cutoff that meets your goals, and run the tool a second time with
Cutoff set to Specified. If one rate is not more important than the
other, you might be satisfied with the Automatic option. When in
doubt, consult a statistician."""),
    arcGISDisplayName=_(u'Cutoff'))
# Fixed typos in the user-visible description: "Cutoff to value to use" ->
# "Cutoff value to use", "Specifed" -> "Specified" (the parameter value is
# spelled 'Specified' in the cutoff argument's allowedValues).
AddArgumentMetadata(ModelEvaluation.PlotROCOfBinaryClassificationModel, u'cutoffValue',
    typeMetadata=FloatTypeMetadata(canBeNone=True),
    description=_(
u"""Cutoff value to use when the Cutoff parameter is set to
Specified. Please see the documentation for that parameter for more
information."""),
    arcGISDisplayName=_(u'Cutoff value'))
AddArgumentMetadata(ModelEvaluation.PlotROCOfBinaryClassificationModel, u'colorize',
    typeMetadata=BooleanTypeMetadata(),
    description=_(
u"""If True, the ROC curve will be colorized by cutoff value. If
False, the ROC curve will be black."""),
    arcGISDisplayName=_(u'Colorize ROC curve'))

# These arguments have the same semantics as on PlotPerformanceOfBinaryClassificationModel.
CopyArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'title', ModelEvaluation.PlotROCOfBinaryClassificationModel, u'title')
CopyArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'evaluationDataTable', ModelEvaluation.PlotROCOfBinaryClassificationModel, u'evaluationDataTable')
CopyArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'where', ModelEvaluation.PlotROCOfBinaryClassificationModel, u'where')

AddArgumentMetadata(ModelEvaluation.PlotROCOfBinaryClassificationModel, u'outputSummaryFile',
    typeMetadata=FileTypeMetadata(canBeNone=True, mustBeDifferentThanArguments=[u'inputModelFile', u'outputPlotFile'], deleteIfParameterIsTrue=u'overwriteExisting', createParentDirectories=True),
    description=_(
u"""Text file to receive the model summary statistics that this tool
reports as log messages. If a cutoff is provided or calculated by this
tool, a contingency table and associated statistics will also be
written."""),
    direction=u'Output',
    arcGISDisplayName=_(u'Model summary statistics file'),
    arcGISCategory=_(u'Output file options'))

CopyArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'outputPlotFile', ModelEvaluation.PlotROCOfBinaryClassificationModel, u'outputPlotFile')
CopyArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'res', ModelEvaluation.PlotROCOfBinaryClassificationModel, u'res')
CopyArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'width', ModelEvaluation.PlotROCOfBinaryClassificationModel, u'width')
CopyArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'height', ModelEvaluation.PlotROCOfBinaryClassificationModel, u'height')
CopyArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'pointSize', ModelEvaluation.PlotROCOfBinaryClassificationModel, u'pointSize')
CopyArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'bg', ModelEvaluation.PlotROCOfBinaryClassificationModel, u'bg')
CopyArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'overwriteExisting', ModelEvaluation.PlotROCOfBinaryClassificationModel, u'overwriteExisting')

AddResultMetadata(ModelEvaluation.PlotROCOfBinaryClassificationModel, u'outputCutoff',
    typeMetadata=FloatTypeMetadata(canBeNone=True),
    description=_(
u"""Output cutoff value. The returned value depends on what was
specified for the Cutoff parameter:

* None - no value is returned (in Python, None is returned, often
  called NULL in other programming languages).

* Automatic - the automatically calculated value is returned.

* Specified - the value specified for the Cutoff Value parameter is
  returned.
"""),
    arcGISDisplayName=_(u'Output cutoff'))

AddResultMetadata(ModelEvaluation.PlotROCOfBinaryClassificationModel, u'tp',
    typeMetadata=FloatTypeMetadata(canBeNone=True),
    description=_(
u"""The number of true positives obtained for the cutoff value that
was used. If no cutoff was used, no value is returned (in Python, None
is returned, often called NULL in other programming languages)."""))

AddResultMetadata(ModelEvaluation.PlotROCOfBinaryClassificationModel, u'fp',
    typeMetadata=FloatTypeMetadata(canBeNone=True),
    description=_(
u"""The number of false positives obtained for the cutoff value that
was used. If no cutoff was used, no value is returned (in Python, None
is returned, often called NULL in other programming languages)."""))

AddResultMetadata(ModelEvaluation.PlotROCOfBinaryClassificationModel, u'fn',
    typeMetadata=FloatTypeMetadata(canBeNone=True),
    description=_(
u"""The number of false negatives obtained for the cutoff value that
was used. If no cutoff was used, no value is returned (in Python, None
is returned, often called NULL in other programming languages)."""))

AddResultMetadata(ModelEvaluation.PlotROCOfBinaryClassificationModel, u'tn',
    typeMetadata=FloatTypeMetadata(canBeNone=True),
    description=_(
u"""The number of true negatives obtained for the cutoff value that
was used. If no cutoff was used, no value is returned (in Python, None
is returned, often called NULL in other programming languages)."""))

AddResultMetadata(ModelEvaluation.PlotROCOfBinaryClassificationModel, u'auc',
    typeMetadata=FloatTypeMetadata(),
    description=_(u"""The area under the ROC curve."""))
# Fixed typo in the user-visible description: "respose" -> "response".
AddResultMetadata(ModelEvaluation.PlotROCOfBinaryClassificationModel, u'mxe',
    typeMetadata=FloatTypeMetadata(canBeNone=True),
    description=_(
u"""Mean cross-entropy for the model, if the model is probabilistic
(the response ranges from 0 to 1), or None otherwise. MXE := - 1/(P+N)
sum_{y_i=+} ln(yhat_i) + sum_{y_i=-} ln(1-yhat_i)."""))
AddResultMetadata(ModelEvaluation.PlotROCOfBinaryClassificationModel, u'prbe',
    typeMetadata=FloatTypeMetadata(canBeNone=True),
    description=_(
u"""Precision-recall break-even point for the model, which is the
cutoff where precision and recall are equal. At this point, positive
and negative predictions are made at the same rate as their prevalence
in the data. In the event that there are multiple break-even points,
the cutoff for the last one will be returned. If there is no such
point, None will be returned. (I am not actually sure this is
possible)."""))

AddResultMetadata(ModelEvaluation.PlotROCOfBinaryClassificationModel, u'rmse',
    typeMetadata=FloatTypeMetadata(canBeNone=True),
    description=_(
u"""Root-mean-squared error, if the response variable is a numeric
value (rather than a categorical string), or None otherwise. RMSE :=
sqrt(1/(P+N) sum_i (y_i - yhat_i)^2)."""))

# Public method: ModelEvaluation.RandomlySplitArcGISTableIntoTrainingAndTestRecords

AddMethodMetadata(ModelEvaluation.RandomlySplitArcGISTableIntoTrainingAndTestRecords,
    shortDescription=_(u'Randomly designates the records of a table as either training records (for fitting a predictive model) or test records (for evaluating the model).'),
    isExposedToPythonCallers=True,
    isExposedByCOM=True,
    isExposedAsArcGISTool=True,
    arcGISDisplayName=_(u'Randomly Split Table Into Training and Test Records'),
    arcGISToolCategory=_(u'Statistics\\Model Data'),
    dependencies=[ArcGISDependency(9, 1)])

CopyArgumentMetadata(ModelEvaluation.PlotPerformanceOfBinaryClassificationModel, u'cls', ModelEvaluation.RandomlySplitArcGISTableIntoTrainingAndTestRecords, u'cls')

AddArgumentMetadata(ModelEvaluation.RandomlySplitArcGISTableIntoTrainingAndTestRecords, u'table',
    typeMetadata=ArcGISTableViewTypeMetadata(mustExist=True),
    description=_(
u"""ArcGIS table, table view, feature class, or feature layer."""),
    arcGISDisplayName=_(u'Table'))

AddArgumentMetadata(ModelEvaluation.RandomlySplitArcGISTableIntoTrainingAndTestRecords, u'percentTest',
    typeMetadata=FloatTypeMetadata(minValue=0.0, maxValue=100.0),
    description=_(
u"""Percent of the table's records to randomly designate test records.
The remaining records will be designated training records.

A popular ratio of training to test is 2 to 1, but a different ratio
may be appropriate depending on the total number of records and the
objective of your study. Please consult the scientific literature for
advice. The following article is a good place to start.

Guisan, A., and Zimmerman, N.E. 2000. Predictive habitat distribution
models in ecology. Ecological Modeling 135: 147-186."""),
    arcGISDisplayName=_(u'Percentage of test records'))

AddArgumentMetadata(ModelEvaluation.RandomlySplitArcGISTableIntoTrainingAndTestRecords, u'field',
    typeMetadata=UnicodeStringTypeMetadata(),
    description=_(
u"""Field to be randomly assigned the value 0 for training records and
1 for test records.

If the field already exists, it must have the SHORT, LONG, FLOAT, or
DOUBLE data type. If it does not exist, it will be created with the
SHORT data type. If you do not provide a field name, the name
TestData will be used."""),
    arcGISDisplayName=_(u'Field to assign'))

AddArgumentMetadata(ModelEvaluation.RandomlySplitArcGISTableIntoTrainingAndTestRecords, u'where',
    typeMetadata=SQLWhereClauseTypeMetadata(canBeNone=True),
    description=ArcGIS91SelectCursor.__init__.__doc__.Obj.GetArgumentByName(u'where').Description,
    arcGISParameterDependencies=[u'table'],
    arcGISDisplayName=_(u'Where clause'))
# Fixed typo in the user-visible description: "every time your run the tool"
# -> "every time you run the tool".
AddArgumentMetadata(ModelEvaluation.RandomlySplitArcGISTableIntoTrainingAndTestRecords, u'seed',
    typeMetadata=IntegerTypeMetadata(canBeNone=True),
    description=_(
u"""Random number seed to use.

This parameter is optional. If a value is provided, it will be used to
initialize the random number generator before the records are randomly
classified as training or test records. Use this option when it is
necessary to reproduce a given classification of records every time
you run the tool.

If you do not specify a seed value, the random number generator will
be initialized from the current time. This will guarantee that the
tool classifies the records differently every time you run it."""),
    arcGISDisplayName=_(u'Random number seed'))
AddResultMetadata(ModelEvaluation.RandomlySplitArcGISTableIntoTrainingAndTestRecords, u'updatedTable',
    typeMetadata=ArcGISTableViewTypeMetadata(),
    description=_(u'Updated table.'),
    arcGISDisplayName=_(u'Updated table'),
    arcGISParameterDependencies=[u'table'])

###############################################################################
# Names exported by this module
###############################################################################

__all__ = ['GLM', 'GAM', 'TreeModel', 'LinearMixedModel', 'RandomForestModel', 'ModelEvaluation']