function tripal_feature_gff3_load_form

2.x tripal_feature.gff_loader.inc tripal_feature_gff3_load_form()
1.x gff_loader.inc tripal_feature_gff3_load_form()

The form to submit a GFF3 loading job

Related topics

1 string reference to 'tripal_feature_gff3_load_form'
tripal_feature_menu in tripal_feature/tripal_feature.module
Implements hook_menu().

File

tripal_feature/includes/tripal_feature.gff_loader.inc, line 21
Provides gff3 loading functionality. Creates features based on their specification in a GFF3 file.

Code

/**
 * Builds the form used to submit a GFF3 loading job.
 *
 * The organism and analysis select lists are populated from the Chado
 * "organism" and "analysis" tables via chado_query(). Each list starts with
 * an empty option so that no record is pre-selected.
 *
 * @return
 *   A Drupal Form API array describing the GFF3 loader form.
 */
function tripal_feature_gff3_load_form() {
  $form = array();

  $form['gff_file'] = array(
    '#type' => 'textfield',
    '#title' => t('GFF3 File'),
    '#description' => t('Please enter the full system path for the GFF file, or a path within the Drupal
                           installation (e.g. /sites/default/files/xyz.gff).  The path must be accessible to the
                           server on which this Drupal instance is running.'),
    '#required' => TRUE,
  );

  // Get the list of organisms. The leading empty option means nothing is
  // pre-selected.
  $sql = "SELECT * FROM {organism} ORDER BY genus, species";
  $org_rset = chado_query($sql);
  $organisms = array();
  $organisms[''] = '';
  while ($organism = $org_rset->fetchObject()) {
    $organisms[$organism->organism_id] = "{$organism->genus} {$organism->species} ({$organism->common_name})";
  }

  // NOTE: '#type' values are machine names and must never be passed through
  // t(); a translated value would no longer match a known element type.
  $form['organism_id'] = array(
    '#title' => t('Organism'),
    '#type' => 'select',
    '#description' => t("Choose the organism to which these sequences are associated"),
    '#required' => TRUE,
    '#options' => $organisms,
  );

  // Get the list of analyses.
  $sql = "SELECT * FROM {analysis} ORDER BY name";
  $analysis_rset = chado_query($sql);
  $analyses = array();
  $analyses[''] = '';
  while ($analysis = $analysis_rset->fetchObject()) {
    $analyses[$analysis->analysis_id] = "{$analysis->name} ({$analysis->program} {$analysis->programversion}, {$analysis->sourcename})";
  }
  $form['analysis_id'] = array(
    '#title' => t('Analysis'),
    '#type' => 'select',
    '#description' => t("Choose the analysis to which these features are associated.
       Why specify an analysis for a data load?  All data comes
       from some place, even if downloaded from Genbank. By specifying
       analysis details for all data imports it allows an end user to reproduce the
       data set, but at least indicates the source of the data."),
    '#required' => TRUE,
    '#options' => $analyses,
  );

  $form['line_number'] = array(
    '#type' => 'textfield',
    '#title' => t('Start Line Number'),
    '#description' => t('Enter the line number in the GFF file where you would like to begin processing.  The
      first line is line number 1.  This option is useful for examining loading problems with large GFF files.'),
    '#size' => 10,
  );

  $form['landmark_type'] = array(
    '#title' => t('Landmark Type'),
    '#type' => 'textfield',
    '#description' => t("Optional. Use this field to specify a Sequence Ontology type
       for the landmark sequences in the GFF fie (e.g. 'chromosome'). If the GFF file
       contains a '##sequence-region' line that describes the landmark sequences to
       which all others are aligned and a type is provided here then the features
       will be created if they do not already exist.  If they do exist then this
       field is not used."),
  );

  $form['alt_id_attr'] = array(
    '#title' => t('ID Attribute'),
    '#type' => 'textfield',
    '#description' => t("Optional. Sometimes lines in the GFF file are missing the
      required ID attribute that specifies the unique name of the feature, but there
      may be another attribute that can uniquely identify the feature.  If so,
      you may specify the name of the attribute to use for the name."),
  );

  // Advanced Options
  $form['advanced'] = array(
    '#type' => 'fieldset',
    '#title' => t('Advanced Options'),
    '#collapsible' => TRUE,
    '#collapsed' => TRUE,
  );

  // The '#weight' values on the nested fieldsets control their display
  // order: import options (0), targets (1), protein names (5).
  $form['advanced']['protein_names'] = array(
    '#type' => 'fieldset',
    '#title' => t('Protein Names'),
    '#collapsible' => TRUE,
    '#collapsed' => FALSE,
    '#weight' => 5,
  );

  $form['advanced']['protein_names']['re_help'] = array(
    '#type' => 'item',
    '#markup' => t('A regular expression is an advanced method for extracting information from a string of text.
                   If your GFF3 file does not contain polypeptide (or protein) features, but contains CDS features, proteins will be automatically created.
                   By default the loader will give each protein a name based on the name of the corresponding mRNA followed by the "-protein" suffix.
                   If you want to customize the name of the created protein, you can use the following regex.')
  );
  $form['advanced']['protein_names']['re_mrna'] = array(
    '#type' => 'textfield',
    '#title' => t('Regular expression for the mRNA name'),
    '#required' => FALSE,
    '#description' => t('Enter the regular expression that will extract portions of
       the mRNA unique name. For example, for a
       mRNA with a unique name finishing by -RX (e.g. SPECIES0000001-RA),
       the regular expression would be, "^(.*?)-R([A-Z]+)$".')
  );
  // The example pattern below must stay identical to the one shown for
  // 're_mrna' above, since "$1" and "$2" refer to its capture groups.
  $form['advanced']['protein_names']['re_protein'] = array(
    '#type' => 'textfield',
    '#title' => t('Replacement string for the protein name'),
    '#required' => FALSE,
    '#description' => t('Enter the replacement string that will be used to create
       the protein name based on the mRNA regular expression. For example, for a
       mRNA regular expression "^(.*?)-R([A-Z]+)$", the corresponding protein regular
       expression would be "$1-P$2".')
  );

  $form['advanced']['import_options'] = array(
    '#type' => 'fieldset',
    '#title' => t('Import Options'),
    '#collapsible' => TRUE,
    '#collapsed' => FALSE,
    '#weight' => 0,
  );

  $form['advanced']['import_options']['use_transaction'] = array(
    '#type' => 'checkbox',
    '#title' => t('Use a transaction'),
    '#required' => FALSE,
    '#description' => t('Use a database transaction when loading the GFF file.  If an error occurs
      the entire datset loaded prior to the failure will be rolled back and will not be available
      in the database.  If this option is unchecked and failure occurs all records up to the point
      of failure will be present in the database.'),
    '#default_value' => 1,
  );
  $form['advanced']['import_options']['add_only'] = array(
    '#type' => 'checkbox',
    '#title' => t('Import only new features'),
    '#required' => FALSE,
    '#description' => t('The job will skip features in the GFF file that already
                         exist in the database and import only new features.'),
  );
  // Checked by default. '#default_value' is given exactly once; a duplicate
  // array key would be silently overridden by the later entry.
  $form['advanced']['import_options']['update'] = array(
    '#type' => 'checkbox',
    '#title' => t('Import all and update'),
    '#required' => FALSE,
    '#description' => t('Existing features will be updated and new features will be added.  Attributes
                         for a feature that are not present in the GFF but which are present in the
                         database will not be altered.'),
    '#default_value' => 1,
  );
  // SPF: there are bugs in refreshing and removing features.  The bugs arise
  //      if a feature in the GFF does not have a uniquename. GenSAS will auto
  //      generate this uniquename and it will not be the same as a previous
  //      load because it uses the date.  This causes orphaned CDS/exons, UTRs
  //      to be left behind during a delete or refresh.  So, the short term
  //      fix is to remove these options.
  //   $form['import_options']['refresh']= array(
  //     '#type' => 'checkbox',
  //     '#title' => t('Import all and replace'),
  //     '#required' => FALSE,
  //     '#description' => t('Existing features will be updated and feature properties not
  //                          present in the GFF file will be removed.'),
  //   );
  //   $form['import_options']['remove']= array(
  //     '#type' => 'checkbox',
  //     '#title' => t('Delete features'),
  //     '#required' => FALSE,
  //     '#description' => t('Features present in the GFF file that exist in the database
  //                          will be removed rather than imported'),
  //   );
  $form['advanced']['import_options']['create_organism'] = array(
    '#type' => 'checkbox',
    '#title' => t('Create organism'),
    '#required' => FALSE,
    '#description' => t('The Tripal GFF loader supports the "organism" attribute. This allows features of a
       different organism to be aligned to the landmark sequence of another species.  The format of the
       attribute is "organism=[genus]:[species]", where [genus] is the organism\'s genus and [species] is the
       species name. Check this box to automatically add the organism to the database if it does not already exists.
       Otherwise lines with an oraganism attribute where the organism is not present in the database will be skipped.'),
  );

  $form['advanced']['targets'] = array(
    '#type' => 'fieldset',
    '#title' => t('Targets'),
    '#collapsible' => TRUE,
    '#collapsed' => FALSE,
    '#weight' => 1,
  );
  $form['advanced']['targets']['adesc'] = array(
    '#markup' => t("When alignments are represented in the GFF file (e.g. such as
       alignments of cDNA sequences to a whole genome, or blast matches), they are
       represented using two feature types: 'match' (or cDNA_match, EST_match, etc.)
       and 'match_part'.  These features may also have a 'Target' attribute to
       specify the sequence that is being aligned.
       However, the organism to which the aligned sequence belongs may not be present in the
       GFF file.  Here you can specify the organism and feature type of the target sequences.
       The options here will apply to all targets unless the organism and type are explicity
       set in the GFF file using the 'target_organism' and 'target_type' attributes."),
  );
  // Reuses the organism list built above; this field is optional.
  $form['advanced']['targets']['target_organism_id'] = array(
    '#title' => t('Target Organism'),
    '#type' => 'select',
    '#description' => t("Optional. Choose the organism to which target sequences belong.
      Select this only if target sequences belong to a different organism than the
      one specified above. And only choose an organism here if all of the target sequences
      belong to the same species.  If the targets in the GFF file belong to multiple
      different species then the organism must be specified using the 'target_organism=genus:species'
      attribute in the GFF file."),
    '#options' => $organisms,
  );
  $form['advanced']['targets']['target_type'] = array(
    '#title' => t('Target Type'),
    '#type' => 'textfield',
    '#description' => t("Optional. If the unique name for a target sequence is not unique (e.g. a protein
       and an mRNA have the same name) then you must specify the type for all targets in the GFF file. If
       the targets are of different types then the type must be specified using the 'target_type=type' attribute
       in the GFF file. This must be a valid Sequence Ontology (SO) term."),
  );
  $form['advanced']['targets']['create_target'] = array(
    '#type' => 'checkbox',
    '#title' => t('Create Target'),
    '#required' => FALSE,
    '#description' => t("If the target feature cannot be found, create one using the organism and type specified above, or
       using the 'target_organism' and 'target_type' fields specified in the GFF file.  Values specified in the
       GFF file take precedence over those specified above."),
  );

  $form['button'] = array(
    '#type' => 'submit',
    '#value' => t('Import GFF3 file'),
    '#weight' => 10,
  );

  return $form;
}