ad_replicate_web_robots_db db
What it does:
Replicates data from the Web Robots Database (http://info.webcrawler.com/mak/projects/robots/active.html) into a table in the ACS database. The data is published on the Web as a flat file, whose format is specified in http://info.webcrawler.com/mak/projects/robots/active/schema.txt. Basically, each non-blank line of the database corresponds to one field (name-value pair) of a record that defines the characteristics of a registered robot. Each record has a "robot-id" field as a unique identifier. (There are many fields in the schema, but, for now, the only ones we care about are: robot-id, robot-name, robot-details-url, and robot-useragent.)
Defined in: /web/philip/tcl/ad-robot-defs.tcl
Returns the number of rows replicated. May raise a Tcl error that should be caught by the caller.
Source code:
# Replicates the Web Robots Database flat file into the "robots" table.
#
# Inputs:  $db -- database handle from the enclosing proc; it is passed
#                 to robot_exists_p and ns_db dml.
# Returns: the number of records written (inserted or updated).
# Errors:  nothing is caught here; any error raised by ad_parameter,
#          ns_geturl, robot_exists_p or ns_db propagates to the caller.
#
# The work is done in two passes so that every record -- including the
# last one in the file -- goes through exactly the same write path and
# is always written under its own robot_id.

set web_robots_db_url [ad_parameter WebRobotsDB robot-detection]
set result [ns_geturl $web_robots_db_url headers]

# Flat-file field name -> column in the robots table, for the fields
# (other than robot-id) that we keep.
set field_map {
    {robot-name        robot_name}
    {robot-details-url robot_details_url}
    {robot-useragent   robot_useragent}
}

# ---- Pass 1: parse the file into parsed(<n>,<column>) -------------------
#
# A "robot-id" line starts a new record; the field lines that follow
# belong to that record until the next "robot-id" line.
set n_parsed 0
foreach line [split $result "\n"] {
    if {[regexp "robot-id: *(.+)" $line match value]} {
        incr n_parsed
        set parsed($n_parsed,robot_id) [string trim $value]
        # Start every record from a clean slate so that nothing can
        # carry over from the previous record.
        foreach pair $field_map {
            set parsed($n_parsed,[lindex $pair 1]) ""
        }
        continue
    }

    # A field line seen before any "robot-id" line has no record to
    # belong to, so it is ignored.
    if {$n_parsed == 0} {
        continue
    }

    foreach pair $field_map {
        set field_name [lindex $pair 0]
        set column [lindex $pair 1]
        if {[regexp "$field_name: *(.+)" $line match value]} {
            # If a field is repeated within a record, keep the first
            # non-empty value.
            if {[empty_string_p $parsed($n_parsed,$column)]} {
                set parsed($n_parsed,$column) [string trim $value]
            }
        }
    }
}

# ---- Pass 2: load the parsed records into the database ------------------
set robot_count 0
for {set i 1} {$i <= $n_parsed} {incr i} {
    set this_robot_id $parsed($i,robot_id)

    # Only load records that have both an id and an actual value for
    # robot_useragent. (There's no point in keeping info about robots
    # that we can't identify.)
    if {[empty_string_p $this_robot_id] || [empty_string_p $parsed($i,robot_useragent)]} {
        continue
    }

    # Escape single quotes once, up front, for use in the SQL below.
    set q_id        [DoubleApos $this_robot_id]
    set q_name      [DoubleApos $parsed($i,robot_name)]
    set q_url       [DoubleApos $parsed($i,robot_details_url)]
    set q_useragent [DoubleApos $parsed($i,robot_useragent)]

    # Update the row if a record with the same robot_id already exists,
    # otherwise insert a new one.
    if {[robot_exists_p $db $this_robot_id]} {
        ns_log Notice "Updating existing robot: $this_robot_id"
        # NOTE(review): previously only the update for the final record
        # in the file set insertion_date; it is now applied to every
        # update so all records are treated alike -- confirm this is
        # the intended meaning of insertion_date.
        ns_db dml $db "update robots set robot_name = '$q_name', robot_details_url = '$q_url', robot_useragent = '$q_useragent', insertion_date = sysdate where robot_id = '$q_id'"
    } else {
        ns_log Notice "Inserting new robot: $this_robot_id"
        ns_db dml $db "insert into robots(robot_id, robot_name, robot_details_url, robot_useragent) values('$q_id', '$q_name', '$q_url', '$q_useragent')"
    }
    incr robot_count
}

return $robot_count