`
lovejuan1314
  • 浏览: 336384 次
  • 性别: Icon_minigender_1
  • 来自: 北京
社区版块
存档分类
最新评论

将geoip,geoip_city数据导入Postgresql数据库

阅读更多
MaxMind公司提供的geoip数据格式有两种,一种是二进制文件,一种是csv文件,可能还有其他格式的,比如dotted octet format。本次总结只以csv文件举例。

1.从MaxMind得到csv文件,这个需要付费的。
2.然后将得到的文件解压缩到服务器特定目录。
3.安装PostgreSQL数据库,并且安装ip4r
  (可以从网上下载postgresql-distrib-ip4r包安装)
4.安装python以及Psycopg2 
   1)yum list "*psycopg*"
   2)或者直接用wget ftp://ftp.muug.mb.ca/mirror/fedora/epel/5/i386/python-psycopg2-2.0.7-1.el5.i386.rpm得到
   3)rpm -Uvh psycopg2-2.0.7-1.el5.i386.rpm
5. 准备python导入脚本文件
'''
Script for loading GeoIP CSV data into a postgresql database
'''

import logging, psycopg2, psycopg2.extensions, sys

from optparse import OptionGroup, OptionParser
from StringIO import StringIO

class GeoIPDataLoader(object):

    def __init__(self, dsn, blocks='GeoLiteCity-Blocks.csv', locations='GeoLiteCity-Location.csv', schema='public'):
        self.con = psycopg2.connect(dsn)
        # We don't need transactions... right?
        self.con.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
        # The data is in ISO8859_15 encoding
        self.con.set_client_encoding('iso8859_15')
        self.cur = self.con.cursor()

        self.blocks_csv = blocks
        self.location_csv = locations
        self.schema = schema
 
    def close(self):
        self.con.close()

    def create_tables(self):
        print 'Creating structure...',
        self.db_execute(
            '''
                CREATE TABLE locations
                (
                  id bigint NOT NULL,
                  country character(2) NOT NULL,
                  region character(2),
                  city character varying(75),
                  postal_code character varying(15),
                  latitude numeric(6,4) NOT NULL,
                  longitude numeric(7,4),
                  metro_code integer,
                  area_code integer,
                  CONSTRAINT locations_pkey PRIMARY KEY (id)
                );
                
                CREATE TABLE blocks
                (
                  start_ip bigint NOT NULL,
                  end_ip bigint NOT NULL,
                  location_id bigint NOT NULL
                );
               
            '''
            )
        print '\033[1;32mDone\033[1;m'
                
    def create_indexes(self, ip4=False):
        print 'Adding Indexes...',
        sys.stdout.flush()
        if not ip4:
            self.db_execute('''
             CREATE INDEX ix_start_end_ip ON blocks 
                USING btree (start_ip, end_ip) WITH (FILLFACTOR=100);
             CREATE INDEX ix_end_start_ip ON blocks 
                USING btree (end_ip, start_ip) WITH (FILLFACTOR=100); 
                ''')
        else:
            self.db_execute('''
                 CREATE INDEX ix_ip_range ON blocks
                   USING gist (ip_range) WITH (FILLFACTOR=100);
                ''')
        print '\033[1;32mDone\033[1;m'
        
    def create_functions(self, ip4=False):
        print 'Adding utility functions...',
        sys.stdout.flush()
        if ip4:
            self.db_execute('''
                CREATE OR REPLACE FUNCTION get_location(inet) RETURNS bigint AS $$
                  SELECT location_id FROM %s.blocks
                  WHERE ip_range >>= ip4($1)
                $$ LANGUAGE SQL;
                ''' % self.schema)
        else:
            self.db_execute('''
                CREATE OR REPLACE FUNCTION inet_to_bigint(inet) RETURNS bigint AS $$
                    SELECT $1 - inet '0.0.0.0'
                $$ LANGUAGE SQL;
                ''' % self.schema)
        print '\033[1;32mDone\033[1;m'
    
    def create_schema(self):
        try:
            self.db_execute('''CREATE SCHEMA %s;''' % self.schema)
        except psycopg2.ProgrammingError:
          pass   

        self.db_execute('SET search_path TO %s,public;' % self.schema)
        
    def db_execute(self, ddl):
        self.cur.execute(ddl)
#        self.con.commit()
    
    def load_data(self):
        # Load Locations
        self.load_table(self.location_csv, 'locations')
        # Load Blocks
        self.load_table(self.blocks_csv, 'blocks')
    
    def load_table(self, file_name, table_name):
        print 'Loading table \033[1;34m%s\033[1;m from file \033[1;34m%s\033[1;m...' % (table_name, file_name),
        sys.stdout.flush()
        geo_file = open(file_name)
        # Skip the copyright header
        geo_file.readline()
        geo_file.readline()
        #Remove quotes... psycopg2's `copy` errors on them
        string_data = geo_file.read().replace('"', '')
        self.cur.copy_from(StringIO(string_data), table_name,  sep=',', null='')
        print '\033[1;32mDone\033[1;m'
    
    def migrate_to_ip4(self):
        print 'Adding ip_range column'        
        self.db_execute('''
                        ALTER TABLE blocks ADD COLUMN ip_range ip4r;
                        ALTER TABLE blocks ALTER COLUMN ip_range SET STORAGE PLAIN;
                        ''')
        
        print 'Migrating data to ip4...',
        sys.stdout.flush()
        self.db_execute('''UPDATE blocks SET ip_range = ip4r(start_ip::ip4, end_ip::ip4)''')
        print '\033[1;32mDone\033[1;m'

        print 'Dropping unneeded columns'
        self.db_execute('''
                        ALTER TABLE blocks DROP COLUMN start_ip;
                        ALTER TABLE blocks DROP COLUMN end_ip;
                        ''')
    def vacuum(self):
        print 'Vaccuming database...',
        sys.stdout.flush()
        self.db_execute('VACUUM FULL ANALYZE')
        print '\033[1;32mDone\033[1;m'

def main():
    """Parse command-line options and run the requested load steps.

    With no operational flags (-c / -g) the help text is printed. The ip4r
    migration and index/function creation are keyed off the same flags so a
    single invocation can do structure + data + ip4r conversion.
    """
    parser = OptionParser()
    # Operational options
    parser.add_option('-c', '--load-ddl', dest='load_ddl', default=False,
                      action='store_true', help='Create database structure')

    parser.add_option('-g', '--load-data', dest='load', default=False,
                      action='store_true', help='Load the GeoIP data')

    parser.add_option('-b', '--blocks-file', dest='blocks_csv', default='GeoLiteCity-Blocks.csv',
                      action='store', help='GeoIP Blocks CSV file [default: %default]', metavar='BLOCKS_FILE')
    parser.add_option('-l', '--locations-file', dest='locations_csv', default='GeoLiteCity-Location.csv',
                      action='store', help='GeoIP Locations CSV file [default: %default]', metavar='LOCATIONS_FILE')

    db_group = OptionGroup(parser, 'Database Options')
    # Database options
    db_group.add_option('-H', '--host', dest='db_host', default='localhost',
                      action='store', help='Database host [default: %default]', metavar='DB_HOST')
    db_group.add_option('-d', '--database', dest='db_name', default='geoip_db',
                      action='store', help='Database name [default: %default]', metavar='DATABASE_NAME')
    db_group.add_option('-U', '--user', dest='db_user', default='geoip',
                      action='store', help='User [default: %default]', metavar='USER_NAME')
    # BUG FIX: this add_option call and the GeoIPDataLoader construction below
    # were indented one extra space in the original, an IndentationError.
    db_group.add_option('-W', '--prompt', dest='password', default='',
                      action='store', help='Password', metavar='PASSWORD')

    db_group.add_option('-s', '--schema', dest='schema', default='public',
                      action='store', help='Database Schema [default: %default]', metavar='SCHEMA')

    db_group.add_option('--ip4r', dest='ip4', default=False,
                      action='store_true', help='Use IP4r module [default: %default]')

    parser.add_option_group(db_group)

    (options, args) = parser.parse_args()

    data_loader = GeoIPDataLoader("dbname='%s' user='%s' password='%s'  host='%s'" % (options.db_name, options.db_user, options.password, options.db_host),
                                  blocks=options.blocks_csv, locations=options.locations_csv, schema=options.schema)

    # Nothing requested: show usage instead of silently vacuuming.
    if not options.load_ddl and not options.load:
        parser.print_help()
        return

    if options.load_ddl:
        if options.schema != 'public':
            data_loader.create_schema()
        data_loader.create_tables()

    if options.load:
        data_loader.load_data()

    # Migrate before indexing so the gist index is built on ip_range.
    if options.ip4:
        data_loader.migrate_to_ip4()

    if options.load:
        data_loader.create_indexes(options.ip4)

    if options.load_ddl:
        data_loader.create_functions(options.ip4)

    data_loader.vacuum()

# Entry point when executed as a script (e.g. `python load_geoip.py ...`).
if __name__ == "__main__":
    main()



注意:
1.该文件中创建索引使用的WITH选项是PostgreSQL 8.2以上版本才支持的。
2.我添加了数据库密码验证,要不数据库密码验证通不过。

6: 创建表

--DROP TABLE locations;
--DROP TABLE blocks;

-- Table DDL
-- One row per MaxMind location record, keyed by the MaxMind location id.
CREATE TABLE locations
(
  id bigint NOT NULL,
  country character(2) NOT NULL,
  region character(2),
  city character varying(75),
  postal_code character varying(15),
  latitude numeric(6,4) NOT NULL,
  longitude numeric(7,4),
  metro_code integer,
  area_code integer,
  CONSTRAINT locations_pkey PRIMARY KEY (id)
);

-- IP ranges as 32-bit integers; location_id references locations.id.
-- (start_ip, end_ip) as primary key also gives the range-lookup index.
CREATE TABLE blocks
(
  start_ip bigint NOT NULL,
  end_ip bigint NOT NULL,
  location_id bigint NOT NULL,
  CONSTRAINT blocks_pkey PRIMARY KEY (start_ip, end_ip)
);


7: 使用脚本命令导入
chmod +x /tmp/load_geoip.py
python /tmp/load_geoip.py --ip4r  -l /tmp/GeoIPCity-134-Location.csv -b /tmp/GeoIPCity-134-Blocks.csv --host 127.0.0.1 -U slony  -W 12345 -d BeboAndFacebook -g


8. 如果是更新的话,要考虑slony集群环境,先用slonik在主数据库服务器上把表清空,然后再导入新的数据。

参考文章:http://www.siafoo.net/article/53
2
0
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics