Class: Rspider::DataWasher

Inherits:
Object
  • Object
show all
Defined in:
lib/rspider/DataWasher.rb

Instance Method Summary collapse

Instance Method Details

#getDiffRows(exampleFile, dataFile) ⇒ Object

根据文章diff的结果取回标题和内容.



4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# File 'lib/rspider/DataWasher.rb', line 4

# Runs the external `diff` tool on two files and groups the lines that are
# new in +dataFile+ (the ">" lines of diff's normal output format) into rows,
# one row per diff hunk.
#
# exampleFile - path of the reference file (left side of the diff).
# dataFile    - path of the file whose additions are collected (right side).
#
# Returns an Array of Strings. A row is flushed every time a line that is not
# "<", ">" or "--"-prefixed is seen (i.e. a hunk header), and once more at the
# end, so the first element is an empty String whenever diff printed anything.
# Each collected line keeps the text following the ">" marker and is
# terminated by "\n".
def getDiffRows(exampleFile,dataFile)
	# Argument-vector form: no shell is involved, so paths containing spaces or
	# shell metacharacters are passed through verbatim. "--" stops diff from
	# treating a path that begins with "-" as an option.
	diff_res = IO.popen(["diff", "--", exampleFile.to_s, dataFile.to_s], &:read)
	rows = []
	cache = ""
	diff_res.split("\n").each do |l|
		# "---" separators and "<" lines (text only in exampleFile) are ignored.
		next if l.start_with?("--") || l.start_with?("<")
		if l.start_with?(">")
			cache += "#{l[1..-1]}\n"
		else
			# Any other line (hunk header such as "2c2") closes the current row.
			rows.push cache
			cache = ""
		end
	end
	rows.push cache
	rows
end

#parseDir(srcDir, destDir) ⇒ Object



24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/rspider/DataWasher.rb', line 24

# Walks the entries of +srcDir+ and, depending on the global $ENV, either
# writes extracted text into +destDir+ or prints a diff preview.
#
# The first entry is compared against the last entry; every other entry is
# compared against the first one.
#
# * $ENV == "PRO": for each entry that has no regular file of the same name
#   in +destDir+ yet, calls parseText and, when the result is not nil, writes
#   it to +destDir+ under the same name.
# * otherwise: prints the rows returned by getDiffRows for the first four
#   entries only.
#
# srcDir  - directory holding the input files (trailing separator optional).
# destDir - directory receiving the output files (trailing separator optional).
def parseDir(srcDir,destDir)
	puts "now:parse Directory:#{srcDir}"
	# Directory listing order is filesystem-dependent; sorting makes the choice
	# of reference files reproducible.
	files = (Dir.entries(srcDir) - [".", ".."]).sort
	first = files.first
	last = files.last
	# Reference file for the entry at position +index+.
	reference_for = lambda { |index| File.join(srcDir, index == 0 ? last : first) }
	if $ENV == "PRO"
		files.each_with_index do |f, index|
			target = File.join(destDir, f)
			# Never overwrite an already produced output file.
			next if File.file?(target)
			w = parseText(reference_for.call(index), File.join(srcDir, f))
			next if w.nil?
			# Block form closes (and therefore flushes) the handle immediately.
			File.open(target, "w") { |out| out.puts w }
		end
	else
		files.first(4).each_with_index do |f, index|
			diffRows = getDiffRows(reference_for.call(index), File.join(srcDir, f))
			puts "\n\n\n ==============Diff Rows[#{index}]================\n"
			diffRows.each_with_index do |row, x|
				puts "\n+ rows[#{x}]:\n"
				puts row
			end
		end
	end
end

#parseText(exampleFile, dataFile) ⇒ Object

根据文章diff的结果取回标题和内容.



103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# File 'lib/rspider/DataWasher.rb', line 103

# Extracts a title and a body from the diff between two files.
#
# When the global $_SOURCE equals "hexun" the work is handed to
# parseTextHexun. Otherwise the rows from getDiffRows are scanned for the
# first one longer than 150 characters, which is taken as the body; the row
# at index 1 is taken as the title.
#
# exampleFile - path of the reference file.
# dataFile    - path of the file to extract from.
#
# Returns "<title>::==++\n<body>", or nil when no row is longer than 150
# characters or the only such row is at index 0.
def parseText(exampleFile,dataFile)
	return parseTextHexun(exampleFile,dataFile) if $_SOURCE == "hexun"
	puts "not hexun"

	rows = getDiffRows(exampleFile,dataFile)
	body_index = rows.index { |row| row.length > 150 }
	# Index 0 is treated the same as "nothing found".
	return nil if body_index.nil? || body_index == 0

	buffer = ""
	buffer << rows[1] << "::==++\n" << rows[body_index]
	buffer
end

#parseTextHexun(exampleFile, dataFile) ⇒ Object

根据文章diff的结果取回标题和内容(针对和讯理财)



71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# File 'lib/rspider/DataWasher.rb', line 71

# Extracts a title and a body from the diff between two files, using rules
# written for pages from the Hexun finance channel.
#
# The row at index 1 of getDiffRows is the title (with the site suffix
# "-理财频道-和讯网" removed). Body rows are collected until a row matching
# "进入...吧" is met; rows that look like "<digit>*", "上一页" (previous page) or
# "下一页" (next page) are dropped, and the first four remaining rows are
# skipped as well.
#
# exampleFile - path of the reference file.
# dataFile    - path of the file to extract from.
#
# Returns "<title>::==++\n<body rows joined by newline>", or nil when the
# diff produced no title row (e.g. the two files are identical).
def parseTextHexun(exampleFile,dataFile)
	rows = getDiffRows(exampleFile,dataFile)
	title = rows[1]
	# Without a title row there is nothing to extract; nil matches what
	# parseText returns in that situation instead of raising on nil.
	return nil if title.nil?

	kept = 0
	contents = []
	rows.each do |l|
		if l =~ %r{\s*进入.*吧}
			puts "got the end of content;#{l}"
			break
		end
		# Navigation / pagination rows are not counted and not kept.
		next if l =~ %r{^\s*[\d]{1}\*}
		next if l =~ %r{^\s*上一页\s*}
		next if l =~ %r{^\s*下一页\s*}
		# Disabled filter for short "第N页" page markers:
		#if l =~ %r{^\s*第[\d]页} and l.length()<25
		#	next
		#end
		# Only rows after the first four surviving ones belong to the body.
		contents.push l if kept > 3
		kept += 1
	end
	returns = ""
	returns << title.sub("-理财频道-和讯网","")
	returns << "::==++\n"
	returns << contents.join("\n")
	returns
end